Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion itn/chinese/inverse_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self,
enable_standalone_number=True,
enable_0_to_9=False,
enable_million=False):
- super().__init__(name='inverse_normalizer', ordertype='itn')
+ super().__init__(name='zh_inverse_normalizer', ordertype='itn')
self.convert_number = enable_standalone_number
self.enable_0_to_9 = enable_0_to_9
self.enable_million = enable_million
Expand Down
19 changes: 13 additions & 6 deletions itn/chinese/rules/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import cross, accep, string_file
from pynini.lib.pynutil import delete, insert, add_weight
Expand All @@ -34,14 +35,20 @@ def __init__(self,
self.build_verbalizer()

def build_tagger(self):
- zero = string_file('itn/chinese/data/number/zero.tsv') # 0
- digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
+ zero = string_file(
+ get_abs_path('../itn/chinese/data/number/zero.tsv')) # 0
+ digit = string_file(
+ get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
special_tilde = string_file(
- 'itn/chinese//data/number/special_tilde.tsv') # 七八十->70~80
+ get_abs_path(
+ '../itn/chinese/data/number/special_tilde.tsv')) # 七八十->70~80
special_dash = string_file(
- 'itn/chinese//data/number/special_dash.tsv') # 七八十->70-80
- sign = string_file('itn/chinese/data/number/sign.tsv') # + -
- dot = string_file('itn/chinese/data/number/dot.tsv') # .
+ get_abs_path(
+ '../itn/chinese/data/number/special_dash.tsv')) # 七八十->70-80
+ sign = string_file(
+ get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -
+ dot = string_file(
+ get_abs_path('../itn/chinese/data/number/dot.tsv')) # .

# 0. 基础数字
addzero = insert('0')
Expand Down
11 changes: 7 additions & 4 deletions itn/chinese/rules/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file, accep
from pynini.lib.pynutil import delete, insert
Expand All @@ -26,14 +27,16 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
- zero = string_file('itn/chinese/data/number/zero.tsv') # 0
+ digit = string_file(
+ get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
+ zero = string_file(
+ get_abs_path('../itn/chinese/data/number/zero.tsv')) # 0

yyyy = digit + (digit | zero)**3 # 二零零八年
yyy = digit + (digit | zero)**2 # 公元一六八年
yy = (digit | zero)**2 # 零八年奥运会
- mm = string_file('itn/chinese/data/date/mm.tsv')
- dd = string_file('itn/chinese/data/date/dd.tsv')
+ mm = string_file(get_abs_path('../itn/chinese/data/date/mm.tsv'))
+ dd = string_file(get_abs_path('../itn/chinese/data/date/dd.tsv'))

year = insert('year: "') + (yyyy | yyy | yy) + \
delete('年') + insert('" ')
Expand Down
4 changes: 3 additions & 1 deletion itn/chinese/rules/fraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete, insert, add_weight
Expand All @@ -28,7 +29,8 @@ def __init__(self):

def build_tagger(self):
number = Cardinal().number
- sign = string_file('itn/chinese/data/number/sign.tsv') # + -
+ sign = string_file(
+ get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -

# NOTE(xcsong): default weight = 1.0, set to -1.0 means higher priority
# For example,
Expand Down
10 changes: 7 additions & 3 deletions itn/chinese/rules/license_plate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import insert
Expand All @@ -26,11 +27,14 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
- zero = string_file('itn/chinese/data/number/zero.tsv') # 0
+ digit = string_file(
+ get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
+ zero = string_file(
+ get_abs_path('../itn/chinese/data/number/zero.tsv')) # 0
digits = zero | digit
province = string_file(
- 'itn/chinese/data/license_plate/province.tsv') # 皖
+ get_abs_path(
+ '../itn/chinese/data/license_plate/province.tsv')) # 皖
license_plate = province + self.ALPHA + (self.ALPHA | digits)**5
license_plate |= province + self.ALPHA + (self.ALPHA | digits)**6
tagger = insert('value: "') + license_plate + insert('"')
Expand Down
4 changes: 3 additions & 1 deletion itn/chinese/rules/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import insert
Expand All @@ -27,7 +28,8 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- operator = string_file('itn/chinese/data/math/operator.tsv')
+ operator = string_file(
+ get_abs_path('../itn/chinese/data/math/operator.tsv'))

number = Cardinal().number
tagger = (number + (operator + number).plus)
Expand Down
10 changes: 7 additions & 3 deletions itn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file, accep, cross
from pynini.lib.pynutil import delete, insert, add_weight
Expand All @@ -29,9 +30,12 @@ def __init__(self, exclude_one=True, enable_0_to_9=True):
self.build_verbalizer()

def build_tagger(self):
- units_en = string_file('itn/chinese/data/measure/units_en.tsv')
- units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
- sign = string_file('itn/chinese/data/number/sign.tsv') # + -
+ units_en = string_file(
+ get_abs_path('../itn/chinese/data/measure/units_en.tsv'))
+ units_zh = string_file(
+ get_abs_path('../itn/chinese/data/measure/units_zh.tsv'))
+ sign = string_file(
+ get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -
to = cross('到', '~') | cross('到百分之', '~')

units = add_weight(
Expand Down
9 changes: 6 additions & 3 deletions itn/chinese/rules/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete, insert
Expand All @@ -28,9 +29,11 @@ def __init__(self, enable_0_to_9=True):
self.build_verbalizer()

def build_tagger(self):
- code = string_file('itn/chinese/data/money/code.tsv')
- symbol = string_file('itn/chinese/data/money/symbol.tsv')
- digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
+ code = string_file(get_abs_path('../itn/chinese/data/money/code.tsv'))
+ symbol = string_file(
+ get_abs_path('../itn/chinese/data/money/symbol.tsv'))
+ digit = string_file(
+ get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9

number = Cardinal().number if self.enable_0_to_9 else \
Cardinal().number_exclude_0_to_9
Expand Down
4 changes: 3 additions & 1 deletion itn/chinese/rules/postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete
Expand All @@ -23,7 +24,8 @@ class PostProcessor(Processor):

def __init__(self, remove_interjections=True):
super().__init__(name='postprocessor')
- blacklist = string_file('itn/chinese/data/default/blacklist.tsv')
+ blacklist = string_file(
+ get_abs_path('../itn/chinese/data/default/blacklist.tsv'))

processor = self.VSIGMA
if remove_interjections:
Expand Down
9 changes: 5 additions & 4 deletions itn/chinese/rules/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete, insert
Expand All @@ -26,10 +27,10 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- h = string_file('itn/chinese/data/time/hour.tsv')
- m = string_file('itn/chinese/data/time/minute.tsv')
- s = string_file('itn/chinese/data/time/second.tsv')
- noon = string_file('itn/chinese/data/time/noon.tsv')
+ h = string_file(get_abs_path('../itn/chinese/data/time/hour.tsv'))
+ m = string_file(get_abs_path('../itn/chinese/data/time/minute.tsv'))
+ s = string_file(get_abs_path('../itn/chinese/data/time/second.tsv'))
+ noon = string_file(get_abs_path('../itn/chinese/data/time/noon.tsv'))

tagger = ((insert('noon: "') + noon + insert('" ')).ques +
insert('hour: "') + h + insert('"') + insert(' minute: "') +
Expand Down
4 changes: 3 additions & 1 deletion itn/chinese/rules/whitelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import insert
Expand All @@ -26,7 +27,8 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- whitelist = string_file('itn/chinese/data/default/whitelist.tsv')
+ whitelist = string_file(
+ get_abs_path('../itn/chinese/data/default/whitelist.tsv'))

tagger = insert('value: "') + whitelist + insert('"')
self.tagger = self.add_tokens(tagger)
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,11 @@
url="https://github.com/wenet-e2e/WeTextProcessing",
packages=find_packages(),
package_data={
- "tn": ["*.fst"],
- "itn": ["*.fst"],
+ "tn": [
+ "*.fst", "chinese/data/*/*.tsv", "english/data/*/*.tsv",
+ "english/data/*.tsv", "english/data/*/*.far"
+ ],
+ "itn": ["*.fst", "chinese/data/*/*.tsv"],
},
install_requires=['pynini==2.1.5', 'importlib_resources'],
entry_points={
Expand Down
2 changes: 1 addition & 1 deletion tn/chinese/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self,
remove_puncts=False,
full_to_half=True,
tag_oov=False):
- super().__init__(name='normalizer')
+ super().__init__(name='zh_normalizer')
self.remove_interjections = remove_interjections
self.remove_erhua = remove_erhua
self.traditional_to_simple = traditional_to_simple
Expand Down
11 changes: 6 additions & 5 deletions tn/chinese/rules/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import accep, cross, string_file
from pynini.lib.pynutil import add_weight, delete, insert
Expand All @@ -28,11 +29,11 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- zero = string_file('tn/chinese/data/number/zero.tsv')
- digit = string_file('tn/chinese/data/number/digit.tsv')
- teen = string_file('tn/chinese/data/number/teen.tsv')
- sign = string_file('tn/chinese/data/number/sign.tsv')
- dot = string_file('tn/chinese/data/number/dot.tsv')
+ zero = string_file(get_abs_path('chinese/data/number/zero.tsv'))
+ digit = string_file(get_abs_path('chinese/data/number/digit.tsv'))
+ teen = string_file(get_abs_path('chinese/data/number/teen.tsv'))
+ sign = string_file(get_abs_path('chinese/data/number/sign.tsv'))
+ dot = string_file(get_abs_path('chinese/data/number/dot.tsv'))

rmzero = delete('0') | delete('0')
rmpunct = delete(',').ques
Expand Down
13 changes: 7 additions & 6 deletions tn/chinese/rules/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete, insert
Expand All @@ -26,14 +27,14 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- digit = string_file('tn/chinese/data/number/digit.tsv')
- zero = string_file('tn/chinese/data/number/zero.tsv')
+ digit = string_file(get_abs_path('chinese/data/number/digit.tsv'))
+ zero = string_file(get_abs_path('chinese/data/number/zero.tsv'))

yyyy = digit + (digit | zero)**3
- m = string_file('tn/chinese/data/date/m.tsv')
- mm = string_file('tn/chinese/data/date/mm.tsv')
- d = string_file('tn/chinese/data/date/d.tsv')
- dd = string_file('tn/chinese/data/date/dd.tsv')
+ m = string_file(get_abs_path('chinese/data/date/m.tsv'))
+ mm = string_file(get_abs_path('chinese/data/date/mm.tsv'))
+ d = string_file(get_abs_path('chinese/data/date/d.tsv'))
+ dd = string_file(get_abs_path('chinese/data/date/dd.tsv'))
rmsign = (delete('/') | delete('-') | delete('.')) + insert(' ')

year = insert('year: "') + yyyy + insert('年"')
Expand Down
20 changes: 9 additions & 11 deletions tn/chinese/rules/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from tn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import cross, string_file
from pynini.lib.pynutil import delete, insert
Expand All @@ -27,20 +28,17 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- operator = string_file("tn/chinese/data/math/operator.tsv")
+ operator = string_file(get_abs_path("chinese/data/math/operator.tsv"))
# When it appears alone, it is treated as punctuation
- symbols = (
- cross("~", "到")
- | cross(":", "比")
- | cross("<", "小于")
- | cross(">", "大于")
- )
+ symbols = (cross("~", "到")
+ | cross(":", "比")
+ | cross("<", "小于")
+ | cross(">", "大于"))

number = Cardinal().number
- tagger = (
- number
- + (delete(" ").ques + (operator | symbols) + delete(" ").ques + number).star
- )
+ tagger = (number +
+ (delete(" ").ques +
+ (operator | symbols) + delete(" ").ques + number).star)
tagger |= operator
tagger = insert('value: "') + tagger + insert('"')
self.tagger = self.add_tokens(tagger)
7 changes: 5 additions & 2 deletions tn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from tn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import accep, cross, string_file
from pynini.lib.pynutil import delete, insert, add_weight
Expand All @@ -27,8 +28,10 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- units_en = string_file('tn/chinese/data/measure/units_en.tsv')
- units_zh = string_file('tn/chinese/data/measure/units_zh.tsv')
+ units_en = string_file(
+ get_abs_path('chinese/data/measure/units_en.tsv'))
+ units_zh = string_file(
+ get_abs_path('chinese/data/measure/units_zh.tsv'))
units = add_weight((cross("k", "千") | cross("w", "万")), 0.1).ques + \
(units_en | units_zh)
rmspace = delete(' ').ques
Expand Down
5 changes: 3 additions & 2 deletions tn/chinese/rules/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from tn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
+ from tn.utils import get_abs_path

from pynini import string_file
from pynini.lib.pynutil import delete, insert
Expand All @@ -27,8 +28,8 @@ def __init__(self):
self.build_verbalizer()

def build_tagger(self):
- code = string_file('tn/chinese/data/money/code.tsv')
- symbol = string_file('tn/chinese/data/money/symbol.tsv')
+ code = string_file(get_abs_path('chinese/data/money/code.tsv'))
+ symbol = string_file(get_abs_path('chinese/data/money/symbol.tsv'))

number = Cardinal().number
tagger = (insert('currency: "') + (code | symbol) + delete(' ').ques +
Expand Down
Loading