tokenize FASTER

2019-12-01 09:44:40 -08:00
parent 7409eca38b
commit 569bbf7397
3 changed files with 37 additions and 30 deletions

@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
 cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
     idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
     cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
     print(pyuni)
     # cdef bytes b = pyuni.encode('utf-8')
     # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
     # retval[len(b)] = 0
     # return retval
 
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)
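
The speedup is in the vocab_idx_of hunk: list.index scans the whole vocabulary on every call (O(n) per token), while a dict lookup hashes once (O(1)). A minimal sketch of the before/after this diff implies, assuming nn.vocab was a list of words before the commit and a word-to-index dict after it (the build step and sample words below are hypothetical, not from the repo):

# Hypothetical vocabulary; only nn.vocab / nn.inv_vocab appear in the diff.
words = ["the", "cat", "sat"]

# Before: a list, searched linearly; a miss raises ValueError.
vocab_list = list(words)

# After: a dict mapping word -> index; a miss raises KeyError.
vocab = {w: i for i, w in enumerate(words)}
inv_vocab = words  # index -> word, as f_idx_list_to_print uses it

def vocab_idx_of(word):
    """Python mirror of the Cython vocab_idx_of: -1 means out-of-vocabulary."""
    try:
        return vocab[word]
    except KeyError:
        return -1

assert vocab_list.index("cat") == vocab["cat"] == 1  # same answer, O(n) vs O(1)
assert vocab_idx_of("dog") == -1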
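
For the c_onehot hunk, nn.onehot presumably expands an index array into a one-hot matrix before the buffer goes back to C, and ensure_contiguous would then guarantee C-contiguous memory. A sketch under those assumptions (onehot and vocab_size here are stand-ins, not the project's actual API; note also that the diff's np.int was removed in NumPy 1.24, where np.intp or plain int is the modern spelling):

import numpy as np

def onehot(idxs, vocab_size):
    # Stand-in for nn.onehot: one row per index, a single 1.0 in that column.
    idxs = np.asarray(idxs).astype(np.intp)
    oh = np.zeros((idxs.size, vocab_size), dtype=np.float32)
    oh[np.arange(idxs.size), idxs] = 1.0
    return oh

oh = onehot([2.0, 0.0], vocab_size=4)
oh = np.ascontiguousarray(oh)  # what ensure_contiguous presumably enforces
assert oh[0, 2] == 1.0 and oh[1, 0] == 1.0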