From 569bbf7397a3c31158582b0c2c4900e49e1fe193 Mon Sep 17 00:00:00 2001
From: Pavel Lutskov <pavel.lutskov@gmail.com>
Date: Sun, 1 Dec 2019 09:44:40 -0800
Subject: [PATCH] tokenize FASTER

---
 bridge.pyx | 14 ++++++++++++--
 library.py | 19 ++++++++++---------
 main.c     | 34 +++++++++++++++-------------------
 3 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/bridge.pyx b/bridge.pyx
index f7eed7c..84087c0 100644
--- a/bridge.pyx
+++ b/bridge.pyx
@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
 
 
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    """Print, space-separated, the words for `num` indices in `f_idxs`.
+
+    Indices are C floats; each is cast to int and looked up in nn.inv_vocab.
+    """
+    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
+    idxs = np.asarray(<float[:num]>f_idxs).astype(int)
+    print(' '.join(nn.inv_vocab[i] for i in idxs))
+
+
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)
diff --git a/library.py b/library.py
index 7a92159..a573384 100644
--- a/library.py
+++ b/library.py
@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+
+# Word -> index, where the index is the word's line number in vocab.txt.
+with open(VOCAB) as vocab_file:  # context manager closes the handle
+    vocab = {w: i for i, w in enumerate(vocab_file.read().splitlines())}
+# Index -> word: the vocabulary keys ordered by their assigned index.
+inv_vocab = sorted(vocab, key=vocab.get)
 
 
 def create_mnist_network():
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
 
 
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
diff --git a/main.c b/main.c
index 8b85c25..34523ef 100644
--- a/main.c
+++ b/main.c
@@ -17,7 +17,7 @@
 
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1);  // this is bad
 }
 
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
-
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                     TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
 
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+            // INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
                 // MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
     free(f_widx);
 }
 
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
+
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready