tokenize FASTER

2019-12-01 09:44:40 -08:00
parent 7409eca38b
commit 569bbf7397
3 changed files with 37 additions and 30 deletions


@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)

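The speed-up in the hunk above comes from the lookup in vocab_idx_of: nn.vocab.index(word) scans a Python list once per token, while nn.vocab[word] is a single hash lookup in a dict, which is why the exception to catch changes from ValueError to KeyError. A minimal sketch of the difference (plain Python, with a made-up vocabulary size; the numbers are illustrative, not measurements from this repository):

    import timeit

    words = [f"w{i}" for i in range(20000)]             # stand-in vocabulary
    vocab_list = words                                   # old representation: list
    vocab_dict = {w: i for i, w in enumerate(words)}     # new representation: dict

    token = "w19999"  # worst case for the list: the last entry
    t_list = timeit.timeit(lambda: vocab_list.index(token), number=1000)
    t_dict = timeit.timeit(lambda: vocab_dict[token], number=1000)
    print(f"list.index: {t_list:.4f}s   dict lookup: {t_dict:.4f}s")
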

@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)
 def create_mnist_network():
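
With vocab now a word-to-index dict loaded from vocab.txt, the reverse index-to-word list is recovered by sorting the keys by their value, which is what inv_vocab = sorted(vocab, key=vocab.get) above does. A tiny sketch with a hand-written stand-in vocabulary:

    vocab = {'whale': 0, 'ahab': 1, 'sea': 2}    # toy stand-in for vocab.txt
    inv_vocab = sorted(vocab, key=vocab.get)     # ['whale', 'ahab', 'sea']
    assert all(vocab[w] == i for i, w in enumerate(inv_vocab))
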
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:

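Renaming the second parameter of create_cbow_network from vocab to vocsize makes explicit that the Keras model only needs the vocabulary size (the Embedding layer's input dimension), not the mapping itself. A hedged usage sketch, assuming the function returns the compiled model as create_mnist_network does; the window and embedding sizes merely echo the WIN and EMB defines in main.c below and are not taken from an actual call site in this repository:

    # assumed call site, with vocab and create_cbow_network from this module
    model = create_cbow_network(win=2, vocsize=len(vocab), embed=20)
    # shapes: Input (batch, 2) -> Embedding (batch, 2, 20) -> mean over window (batch, 20)
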
main.c

@@ -17,7 +17,7 @@
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
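
BS, WIN and EMB presumably stand for the batch size, context-window width and embedding dimension consumed by the Python side (create_cbow_network builds Input(shape=[win]) into an Embedding of size embed). The sketch below shows one common way word indices can be grouped into CBOW (context, target) mini-batches under these settings; it is a generic illustration, not the repository's batcher logic:

    import numpy as np

    WIN, BS = 2, 10  # mirror the defines above: context width and batch size

    def cbow_batches(idxs, win=WIN, bs=BS):
        # For each position, the win surrounding indices (win // 2 on each
        # side) form the context and the centre word is the target, matching
        # Input(shape=[win]) in the Keras model.
        half = win // 2
        ctx, tgt = [], []
        for i in range(half, len(idxs) - half):
            ctx.append(idxs[i - half:i] + idxs[i + 1:i + 1 + half])
            tgt.append(idxs[i])
            if len(ctx) == bs:
                yield np.array(ctx, dtype=np.float32), np.array(tgt)
                ctx, tgt = [], []

    # toy usage with made-up word indices
    for x, y in cbow_batches(list(range(25))):
        print(x.shape, y.shape)   # (10, 2) (10,)
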
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                 TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+        // INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         // MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
     free(f_widx);
 }
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready