diff --git a/bridge.pyx b/bridge.pyx
index d2025ae..9c9cd59 100644
--- a/bridge.pyx
+++ b/bridge.pyx
@@ -9,8 +9,9 @@ from libc.string cimport memcpy
 import library as nn
 
 
-X_train, y_train, X_test, y_test = nn.load_mnist()
 tokenizers = {}
+X_test = None
+y_test = None
 
 
 cdef extern from "numpy/arrayobject.h":
@@ -74,20 +75,18 @@ cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
 # return retval
 
 
-cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
-    oh = nn.onehot(np.asarray(idxs), nc=len(nn.vocab))
-    ensure_contiguous(oh)
-    memcpy(y, PyArray_DATA(oh), oh.size * sizeof(float))
-    # eprint(np.argmax(oh, axis=1))
-
-
-cdef public void c_slices(float* X, float* idxs, size_t bs, size_t win):
-    X_np = np.asarray(X)
+cdef public void cbow_batch(
+    float* X, float* y, float* idxs, size_t bs, size_t win
+):
     idxs_np = np.asarray(idxs)
+    # Deal with X
+    X_np = np.asarray(X)
     for r in range(bs):
         X_np[r, :win] = idxs_np[r:r+win]
         X_np[r, win:] = idxs_np[r+win+1:r+win+1+win]
-    # eprint(X_np)
+
+    # Deal with y
+    nn.onehot(np.asarray(y), idxs_np[win:-win])
 
 
 cdef public void debug_print(object o):
@@ -121,26 +120,6 @@ cdef public float eval_net(object net):
     return net.evaluate(X_test, y_test, verbose=False)
 
 
-cdef public void mnist_batch(float* X, float* y, size_t bs,
-                             int part, int total):
-    if total == 0:
-        X_pool, y_pool = X_train, y_train
-    else:
-        partsize = len(X_train) // total
-        X_pool = X_train[part*partsize:(part+1)*partsize]
-        y_pool = y_train[part*partsize:(part+1)*partsize]
-
-    idx = np.random.choice(len(X_pool), bs, replace=True)
-
-    X_r = X_pool[idx]
-    y_r = y_pool[idx]
-
-    assert X_r.flags['C_CONTIGUOUS']
-    assert y_r.flags['C_CONTIGUOUS']
-    memcpy(X, PyArray_DATA(X_r), X_r.size * sizeof(float))
-    memcpy(y, PyArray_DATA(y_r), y_r.size * sizeof(float))
-
-
 cdef public void init_weightlist_like(WeightList* wl, object net):
     weights = net.get_weights()
     wl.n_weights = len(weights)
@@ -177,6 +156,10 @@ cdef public void combo_weights(
         wf += alpha * ww
 
 
+cdef public void create_test_dataset(size_t win):
+    _create_test_dataset(win)
+
+
 cdef list wrap_weight_list(WeightList* wl):
     weights = []
     for i in range(wl.n_weights):
@@ -221,3 +204,9 @@ def ensure_contiguous(a):
 
 def eprint(*args, **kwargs):
     return print(*args, flush=True, **kwargs)
+
+
+def _create_test_dataset(win):
+    global X_test, y_test
+    if X_test is None or y_test is None:
+        X_test, y_test = nn.create_test_dataset(win)
diff --git a/library.py b/library.py
index 25561a8..35f349d 100644
--- a/library.py
+++ b/library.py
@@ -1,16 +1,9 @@
 import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import numpy as np
 import tensorflow as tf
 tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # STFU!
-# from nltk.corpus import stopwords
-# from nltk.tokenize import word_tokenize
-from mynet import load_mnist, onehot
-
-
-def word_tokenize(s: str):
-    l = ''.join(c.lower() if c.isalpha() else ' ' for c in s)
-    return l.split()
 
 
 
 HERE = os.path.abspath(os.path.dirname(__file__))
@@ -20,10 +13,38 @@
 VOCAB = os.path.join(HERE, 'vocab.txt')
 vocab = {
     w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
 }
-# inv_vocab = [vocab[i] for i in range(len(vocab))]
 inv_vocab = sorted(vocab, key=vocab.get)
 
+def onehot(oh_store, idx):
+    oh_store[:] = 0
+    oh_store[np.arange(len(idx)), idx.astype(int)] = 1
+
+
+def word_tokenize(s: str):
+    l = ''.join(c.lower() if c.isalpha() else ' ' for c in s)
+    return l.split()
+
+
+def create_test_dataset(win):
+    S = 1000
+    with open(CORPUS) as f:
+        ds = np.array([vocab[w] for w in word_tokenize(f.read())
+                       if w in vocab])
+    idx = np.random.choice(np.arange(win, len(ds) - win), S)
+    oh_store = np.zeros((S, len(vocab)), dtype=np.float32)
+    onehot(oh_store, ds[idx])
+    return (
+        # X
+        np.stack([
+            np.concatenate([ds[i-win:i], ds[i+1:i+win+1]])
+            for i in idx
+        ], axis=0).astype(np.float32),
+
+        # y
+        oh_store
+    )
+
 def create_mnist_network():
     model = tf.keras.models.Sequential([
         tf.keras.layers.Dense(30, input_shape=(784,), activation='relu'),
@@ -35,8 +56,8 @@ def create_mnist_network():
 
 
 def create_cbow_network(win, embed):
-    ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(len(vocab), embed, input_length=win)(ctxt)
+    ctxt = tf.keras.layers.Input(shape=[2*win])
+    ed = tf.keras.layers.Embedding(len(vocab), embed, input_length=2*win)(ctxt)
     cbow = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     blowup = tf.keras.layers.Dense(len(vocab), activation='softmax')(cbow)
     mod = tf.keras.Model(inputs=ctxt, outputs=blowup)
@@ -44,7 +65,6 @@ def create_cbow_network(win, embed):
         optimizer='sgd',
         loss='categorical_crossentropy',
     )
-    print(mod, flush=True)
 
     return mod
 
diff --git a/main.c b/main.c
index 98e30bb..69cda07 100644
--- a/main.c
+++ b/main.c
@@ -15,9 +15,9 @@
 #define TAG_SWORD 7
 #define TAG_IWORD 8
 
-#define COMM 1
-#define ITER 1
-#define BS 10
+#define COMM 5
+#define ITER 200
+#define BS 32
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -241,6 +241,7 @@ void slave_node() {
     int me = my_mpi_id();
 
    PyObject* net = create_network(WIN, EMB);
+    create_test_dataset(WIN);
 
     WeightList wl;
     init_weightlist_like(&wl, net);
@@ -264,11 +265,11 @@ void slave_node() {
             MPI_Recv(f_widx, n_words, MPI_FLOAT,
                      mpi_id_from_role_id(BATCHER, 0), TAG_BATCH,
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            c_slices(X, f_widx, BS, WIN);
-            c_onehot(y, f_widx + WIN, BS);
+            cbow_batch(X, y, f_widx, BS, WIN);
             step_net(net, X, y, BS);
+            INFO_PRINTLN(".");
         }
-        // printf("%d net: %f\n", my_mpi_id(), eval_net(net));
+        printf("%d net: %f\n", my_mpi_id(), eval_net(net));
         update_weightlist(&wl, net);
         // send_weights(&wl, mpi_id_from_role_id(MASTER, 0), TAG_WEIGH);
     }