diff --git a/.gitignore b/.gitignore
index 0e41c6f..5772e8c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,7 @@
 .DS_Store
 run
 compile_commands.json
-*.txt
+cfg.json
 build/
-cythoned/
 __pycache__/
 data_*/
diff --git a/bridge.pyx b/bridge.pyx
index 09a9120..ebaacab 100644
--- a/bridge.pyx
+++ b/bridge.pyx
@@ -11,8 +11,6 @@ import flask
 
 tokenizers = {}
 
-X_test = None
-y_test = None
 
 
 cdef extern from "numpy/arrayobject.h":
@@ -49,6 +47,30 @@ cdef public void serve():
     nn.app.run(port=8448)
 
 
+cdef public size_t getwin():
+    return nn.WIN
+
+
+cdef public size_t getemb():
+    return nn.EMB
+
+
+cdef public size_t getbs():
+    return nn.CFG['bs']
+
+
+cdef public size_t getbpe():
+    return nn.CFG['bpe']
+
+
+cdef public float gettarget():
+    return nn.CFG['target']
+
+
+cdef public float getflpc():
+    return nn.CFG['flpc']
+
+
 cdef public int get_tokens(WordList* wl, const char *filename):
     fnu = filename.decode('utf-8')
     if fnu not in tokenizers:
@@ -82,10 +104,8 @@ cdef public void _dbg_print(object o):
     eprint(o)
 
 
-cdef public void _dbg_print_cbow_batch(
-    object net, float* batch, size_t bs
-):
-    X_np, y_np = cbow_batch(net, batch, bs)
+cdef public void _dbg_print_cbow_batch(float* batch, size_t bs):
+    X_np, y_np = cbow_batch(batch, bs)
     eprint(X_np)
     eprint(y_np)
 
@@ -95,9 +115,9 @@ cdef public void randidx(int* idx, size_t l, size_t how_much):
     memcpy(idx, PyArray_DATA(i_np), how_much * sizeof(int))
 
 
-cdef public object create_network(int win, int embed):
+cdef public object create_network():
     try:
-        net = nn.create_cbow_network(win, embed)
+        net = nn.create_cbow_network()
         eprint(net)
         return net
     except Exception as e:
@@ -111,7 +131,7 @@ cdef public void set_net_weights(object net, WeightList* wl):
 
 
 cdef public void step_net(
     object net, float* batch, size_t bs
 ):
-    X_train, y_train = cbow_batch(net, batch, bs)
+    X_train, y_train = cbow_batch(batch, bs)
     net.train_on_batch(X_train, y_train)
@@ -120,10 +140,7 @@ cdef public size_t out_size(object net):
 
 
 cdef public float eval_net(object net):
-    try:
-        return net.evaluate(X_test, y_test, verbose=False)
-    except Exception as e:
-        eprint(e)
+    return nn.eval_network(net)
 
 
 cdef public void init_weightlist_like(WeightList* wl, object net):
@@ -162,14 +179,8 @@ cdef public void combo_weights(
         wf += alpha * ww
 
 
-cdef public void create_test_dataset(size_t win):
-    _create_test_dataset(win)
-
-
-cdef tuple cbow_batch(
-    object net, float* batch, size_t bs
-):
-    win = net.input_shape[1] // 2
+cdef tuple cbow_batch(float* batch, size_t bs):
+    win = nn.WIN
     batch_np = np.asarray(batch)
     X_np = batch_np[:, [*range(win), *range(win+1, win+win+1)]]
     y_np = nn.onehot(batch_np[:, win], nc=len(nn.vocab))
@@ -177,6 +188,7 @@
 
 
 cdef list wrap_weight_list(WeightList* wl):
+    """Thinly wraps a WeightList struct into a list of NumPy arrays."""
     weights = []
     for i in range(wl.n_weights):
         w_shape = wl.weights[i].shape
@@ -220,9 +232,3 @@ def ensure_contiguous(a):
 def eprint(*args, **kwargs):
     return print(*args, flush=True, **kwargs)
-
-
-
-def _create_test_dataset(win):
-    global X_test, y_test
-    if X_test is None or y_test is None:
-        X_test, y_test = nn.create_test_dataset(win)
diff --git a/library.py b/library.py
index f0f5b32..0c6033d 100644
--- a/library.py
+++ b/library.py
@@ -1,19 +1,38 @@
 import os
+import json
 
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 from mynet import onehot
 
+WIN = 2
+EMB = 32
+
 HERE = os.path.abspath(os.path.dirname(__file__))
-DATA = os.path.join(HERE, 'data')
+
+
+def read_cfg():
+    with open(os.path.join(HERE, 'cfg.json')) as f:
+        return json.load(f)
+
+
+CFG = read_cfg()
+DATA = os.path.join(HERE, CFG['data'])
 CORPUS = os.path.join(DATA, 'corpus.txt')
 VOCAB = os.path.join(DATA, 'vocab.txt')
 TEST = os.path.join(DATA, 'test.txt')
 
-vocab = {
-    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
-}
-inv_vocab = sorted(vocab, key=vocab.get)
+
+def read_vocab_list():
+    with open(VOCAB) as f:
+        return f.read().split()
+
+
+inv_vocab = read_vocab_list()
+vocab = {w: i for i, w in enumerate(inv_vocab)}
+
+X_test = None
+y_test = None
 
 
 def word_tokenize(s: str):
@@ -21,13 +40,14 @@ def word_tokenize(s: str):
     return l.split()
 
 
-def create_test_dataset(win):
+def create_test_dataset():
     import numpy as np
     test_dataset = np.vectorize(vocab.get)(np.genfromtxt(TEST, dtype=str))
-    assert test_dataset.shape[1] == 2*win + 1
-    X_test = test_dataset[:, [*range(0, win), *range(win+1, win+win+1)]]
-    y_test = onehot(test_dataset[:, win], nc=len(vocab))
-    return X_test, y_test
+    assert test_dataset.shape[1] == 2*WIN + 1
+
+    global X_test, y_test
+    X_test = test_dataset[:, [*range(0, WIN), *range(WIN+1, WIN+WIN+1)]]
+    y_test = onehot(test_dataset[:, WIN], nc=len(vocab))
 
 
 def create_mnist_network():
@@ -44,13 +64,13 @@ def create_mnist_network():
     return model
 
 
-def create_cbow_network(win, embed):
+def create_cbow_network():
     import tensorflow as tf
     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # STFU!
     tf.random.set_random_seed(42)
 
-    ctxt = tf.keras.layers.Input(shape=[2*win])
-    ed = tf.keras.layers.Embedding(len(vocab), embed, input_length=2*win)(ctxt)
+    ctxt = tf.keras.layers.Input(shape=[2*WIN])
+    ed = tf.keras.layers.Embedding(len(vocab), EMB, input_length=2*WIN)(ctxt)
     cbow = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     blowup = tf.keras.layers.Dense(len(vocab), activation='softmax')(cbow)
     mod = tf.keras.Model(inputs=ctxt, outputs=blowup)
@@ -61,9 +81,15 @@
     return mod
 
 
+def eval_network(net):
+    if X_test is None or y_test is None:
+        create_test_dataset()
+    return net.evaluate(X_test, y_test, verbose=False)
+
+
 def token_generator(filename):
     with open(filename) as f:
-        for i, l in enumerate(f.readlines()):
+        for l in f:
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
diff --git a/main.c b/main.c
index 513bd59..498bcac 100644
--- a/main.c
+++ b/main.c
@@ -17,14 +17,6 @@
 #define TAG_INSTR 9
 #define TAG_TERMT 10
 
-#define COMM 50
-#define ITER 250
-#define TARGET 8.40
-#define BS 32
-#define EMB 32
-#define WIN 2
-#define FLPC 1
-
 #define in_range(i, x) (size_t i = 0; i < (x); i++)
 // I am honestly VERY sorry for this
 // but the power of macros corrupts even the best of us
@@ -218,7 +210,7 @@ void filterer() {
     int batcher = mpi_id_from_role_id(BATCHER, rid);
 
     Word w = {0, NULL};
-    const size_t window_size = 2 * WIN + 1;
+    const size_t window_size = 2 * getwin() + 1;
     long* window = malloc(window_size * sizeof(long));
     size_t have = 0;
 
@@ -248,15 +240,16 @@ void batcher() {
     INFO_PRINTF("Starting batcher %d\n", getpid());
     int rid = my_role_id(BATCHER);
     int tokenizer = mpi_id_from_role_id(FILTERER, rid);
+    size_t bs = getbs();
     int learner_mpi_id = 0;
 
-    const size_t window_size = 2 * WIN + 1;
-    const size_t bufsize = BS * window_size;
+    const size_t window_size = 2 * getwin() + 1;
+    const size_t bufsize = bs * window_size;
     float* batch = malloc(bufsize * sizeof(float));
     long* l_wid = malloc(window_size * sizeof(long));
 
     while (1) {
-        for in_range(r, BS) {
+        for in_range(r, bs) {
             recv_window(l_wid, window_size, tokenizer);
             if (l_wid[0] == -1)
                 break;
@@ -327,13 +320,15 @@ void learner() {
     int dispatcher = mpi_id_from_role_id(DISPATCHER, 0);
     INFO_PRINTF("Learner %d (pid %d) is assigned to pipeline %d\n",
                 rid, getpid(), my_batcher_rid);
+    size_t bs = getbs();
+    size_t bpe = getbpe();
 
-    PyObject* net = create_network(WIN, EMB);
+    PyObject* net = create_network();
     WeightList wl;
     init_weightlist_like(&wl, net);
 
-    size_t window_size = (2*WIN + 1);
-    size_t bufsize = BS * window_size;
+    size_t window_size = 2 * getwin() + 1;
+    size_t bufsize = bs * window_size;
     float* batch = malloc(bufsize * sizeof(float));
 
     int go;
@@ -343,11 +338,11 @@ void learner() {
     while (go != -1) {
         recv_weights(&wl, dispatcher);
         set_net_weights(net, &wl);
-        for in_range(k, ITER) {
+        for in_range(k, bpe) {
             MPI_Send(&me, 1, MPI_INT, batcher, TAG_READY, MPI_COMM_WORLD);
             MPI_Recv(batch, bufsize, MPI_FLOAT, batcher, TAG_BATCH,
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            step_net(net, batch, BS);
+            step_net(net, batch, bs);
         }
         update_weightlist(&wl, net);
         send_weights(&wl, dispatcher);
@@ -364,14 +359,17 @@
 void dispatcher() {
     INFO_PRINTF("Starting dispatcher %d\n", getpid());
     int go = 1;
+    size_t bs = getbs();
+    size_t bpe = getbpe();
+    float target = gettarget();
+    float flpc = getflpc();
 
-    PyObject* frank = create_network(WIN, EMB);
-    create_test_dataset(WIN);
+    PyObject* frank = create_network();
     WeightList wl;
     init_weightlist_like(&wl, frank);
     update_weightlist(&wl, frank);
 
-    int lpr = number_of(LEARNER) * FLPC; // Learners per round
+    int lpr = number_of(LEARNER) * flpc; // Learners per round
     WeightList *wls = malloc(sizeof(WeightList) * lpr);
     for in_range(i, lpr) {
         init_weightlist_like(wls + i, frank);
@@ -383,7 +381,7 @@ void dispatcher() {
     float min_loss = crt_loss;
     time_t start = time(NULL);
     size_t rounds = 0;
-    while (crt_loss > TARGET) {
+    while (crt_loss > target) {
         randidx(round, number_of(LEARNER), lpr);
         for in_range(k, lpr) {
             // Instruct learners to learn
@@ -418,12 +416,12 @@ void dispatcher() {
     float delta_l = first_loss - crt_loss;
     INFO_PRINTF(
         "Laptop MPI adam consecutive_batch "
-        "W%d E%d BS%d bpe%d LPR%d pp%lu,"
+        "W%zu E%zu BS%zu bpe%zu LPR%d pp%lu,"
         "%f,%f,%f,%f,"
         "%lu,%.0f,%lu\n",
-        WIN, EMB, BS, ITER, lpr, number_of(TOKENIZER),
-        delta_l/rounds, delta_l/delta_t, min_loss, TARGET,
-        rounds, delta_t,BS*ITER*rounds
+        getwin(), getemb(), bs, bpe, lpr, number_of(TOKENIZER),
+        delta_l/rounds, delta_l/delta_t, min_loss, target,
+        rounds, delta_t, bs*bpe*rounds
     );
     Py_DECREF(frank);
    free_weightlist(&wl);
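
Note: cfg.json is introduced by this patch but also git-ignored, so it is not
part of the repo. A minimal sketch of its shape, inferred from the keys the
patch reads (data, bs, bpe, target, flpc in library.py and the getbs/getbpe/
gettarget/getflpc accessors in bridge.pyx) and from the deleted #define values
(BS 32, ITER 250, TARGET 8.40, FLPC 1); the values shown are assumptions, not
the file actually shipped:

{
    "data": "data",
    "bs": 32,
    "bpe": 250,
    "target": 8.40,
    "flpc": 1.0
}

Here "data" names the dataset directory joined onto HERE, "bs" is the batch
size, "bpe" is the number of batches each learner trains on per round, and
"target" is the loss at which the dispatcher's while loop stops. "flpc" is
the fraction of learners participating per round: with 8 learners and flpc
0.5, lpr = 8 * 0.5 = 4 learners are sampled each round.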