tokenize FASTER

2019-12-01 09:44:40 -08:00
parent 7409eca38b
commit 569bbf7397
3 changed files with 37 additions and 30 deletions


@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
 
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
+
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)
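The speedup named in the commit message largely comes from this lookup: `list.index` is a linear scan over the whole vocabulary for every token, while a dict lookup is amortized O(1). A toy comparison (stand-in vocabulary, not the project's corpus):

import timeit

words = ['w%d' % i for i in range(20000)]             # stand-in vocabulary
vocab_list = list(words)                              # old representation
vocab_dict = {w: i for i, w in enumerate(words)}      # new representation

# A word near the end of the list forces a full scan per lookup.
print(timeit.timeit(lambda: vocab_list.index('w19999'), number=1000))
print(timeit.timeit(lambda: vocab_dict['w19999'], number=1000))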


@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)
 
 def create_mnist_network():
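The two structures stay in sync: vocab maps word to index from the precomputed vocab.txt, and inv_vocab recovers the word list ordered by index, since sorted(vocab, key=vocab.get) sorts keys by their values. A round-trip sketch with stand-in data (not the real vocab.txt):

lines = ['whale', 'ship', 'sea']                # pretend contents of vocab.txt
vocab = {w: i for i, w in enumerate(lines)}
inv_vocab = sorted(vocab, key=vocab.get)        # word at position i has index i

assert all(inv_vocab[vocab[w]] == w for w in vocab)
print(vocab['sea'], inv_vocab[2])               # -> 2 sea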
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
 
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
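With the parameter renamed from vocab to vocsize, the call site passes a count rather than the mapping itself. A hypothetical call, reusing len(vocab) and the EMB/WIN constants from main.c, and assuming the window spans both sides of the target word:

model = create_cbow_network(win=2 * 2, vocsize=len(vocab), embed=20)
model.summary()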
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
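The generator body is truncated here; assuming the branch above goes on to yield tok, consuming it looks like:

for tokens in token_generator(CORPUS):
    print(tokens[:5])     # first few tokens of the first non-blank line
    break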

main.c (34 changes)

@@ -17,7 +17,7 @@
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }
 
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
-
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
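The terminator's length field flips from 0 to 1, plausibly so that send_word transmits the empty string's NUL byte and the filterer's strlen check below sees a well-formed empty word. The sentinel pattern itself, sketched in Python with mpi4py (an analogue only; send_word and Word are the project's own types):

from mpi4py import MPI

comm = MPI.COMM_WORLD
if comm.rank == 0:                    # tokenizer role
    for w in ['call', 'me', 'ishmael']:
        comm.send(w, dest=1)
    comm.send('', dest=1)             # empty word ends the stream
elif comm.rank == 1:                  # filterer role
    while True:
        w = comm.recv(source=0)
        if not w:                     # mirrors `if (!strlen(w.data)) break;`
            break
        print('received', w)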
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                      TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+        //     INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         //          MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
         free(f_widx);
 }
 
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
+
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready