-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnltk_utils.py
More file actions
33 lines (26 loc) · 861 Bytes
/
nltk_utils.py
File metadata and controls
33 lines (26 loc) · 861 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import nltk
import numpy as np
from nltk.stem import SnowballStemmer
from torchtext.data import get_tokenizer
# NLTK resource downloads, left disabled. Presumably a one-time setup step on
# a fresh environment -- TODO confirm which (if any) are still required.
#nltk.download('perluniprops')
#nltk.download('nonbreaking_prefixes')
#nltk.download('punkt')
# Tokenizer callable created once at import time ("toktok" backend, language 'es');
# used by tokenize().
tokenizer = get_tokenizer("toktok", language='es')
# Snowball stemmer configured for Spanish; used by stem().
stemmer = SnowballStemmer('spanish')
def tokenize(sentence):
    """Run the module-level ``tokenizer`` on ``sentence`` and return its result.

    The result is whatever the configured tokenizer callable produces
    (expected to be the sentence's tokens).
    """
    tokens = tokenizer(sentence)
    return tokens
def stem(word):
    """Lower-case ``word`` and return its stem from the module-level ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)
def bag_of_words(tokenized_sentence, all_words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Every token is passed through ``stem``; the result has one slot per entry
    of ``all_words``, set to 1.0 when that entry equals one of the stemmed
    tokens and 0.0 otherwise.

    Example (assuming stemming leaves these words unchanged):
        tokenized_sentence = ["tu", "que", "tal"]
        all_words = ["tu", "yo", "soy", "que", "tal"]
        bag = [ 1, 0, 0, 1, 1]

    Args:
        tokenized_sentence: iterable of word strings from the sentence.
        all_words: sequence of vocabulary words. Entries are compared as-is,
            so they are presumably already stemmed by the caller -- confirm.

    Returns:
        1-D ``np.float32`` array of length ``len(all_words)``.
    """
    # Stem every token once and keep the results in a set so each vocabulary
    # lookup below is O(1) instead of a linear scan of the sentence.
    stemmed_tokens = {stem(w) for w in tokenized_sentence}

    # Size the vector from len(); np.zeros_like would first convert the whole
    # word list into a temporary string array just to read its shape.
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in stemmed_tokens:
            bag[idx] = 1.0
    return bag