From 569bbf7397a3c31158582b0c2c4900e49e1fe193 Mon Sep 17 00:00:00 2001
From: Pavel Lutskov
Date: Sun, 1 Dec 2019 09:44:40 -0800
Subject: [PATCH] tokenize FASTER

---
Review notes (this section is ignored by git-am and is not part of the
commit message):

* bridge.pyx, f_idx_list_to_print: the raw `float*` is wrapped in a typed
  memoryview of length `num` before np.asarray(), because Cython cannot
  coerce a bare pointer to a Python object; `np.int` (removed in NumPy 1.24)
  is spelled as the builtin `int`, which it aliased.
* main.c, batcher: the commented-out call now names f_idx_list_to_print, the
  function this patch actually adds.
* NOTE(review): the c_onehot context line is reproduced unchanged; it shows
  the same bare-pointer np.asarray() call -- confirm against bridge.pyx.
* NOTE(review): `i` from enumerate() in token_generator is unused in the
  lines shown here, and the handle from open(VOCAB) is never closed
  explicitly.
* NOTE(review): indentation and blank context lines were reconstructed from
  the hunk line counts; run `git apply --check` before applying. The blob
  ids on the `index` lines describe the originally generated post-images.

 bridge.pyx | 14 ++++++++++++--
 library.py | 19 ++++++++++---------
 main.c     | 34 +++++++++++++++-------------------
 3 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/bridge.pyx b/bridge.pyx
index f7eed7c..84087c0 100644
--- a/bridge.pyx
+++ b/bridge.pyx
@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
 
 
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
+
+
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(idxs))
     ensure_contiguous(oh)
diff --git a/library.py b/library.py
index 7a92159..a573384 100644
--- a/library.py
+++ b/library.py
@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)
 
 
 def create_mnist_network():
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
 
 
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
diff --git a/main.c b/main.c
index 8b85c25..34523ef 100644
--- a/main.c
+++ b/main.c
@@ -17,7 +17,7 @@
 
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }
 
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
-
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                     TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
 
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-
-        INFO_PRINTLN("");
+        // f_idx_list_to_print(f_widx, n_words);
+        // for in_range(i, n_words) {
+        //     INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         // MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
     free(f_widx);
 }
 
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
+
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready