for some reason I dockerized it and it works

.gitignore (vendored): 4 lines changed

@@ -2,8 +2,8 @@
 .DS_Store
 run
 compile_commands.json
-cfg.json
 build/
 trained/
 __pycache__/
-data_*/
+data
+.dockerignore

Dockerfile (new file): 9 lines

@@ -0,0 +1,9 @@
from ubuntu:18.04

RUN apt -y update && \
    apt -y install build-essential pkg-config ninja-build python3 python3-pip python3-dev mpich
RUN pip3 install meson numpy tensorflow flask cython
RUN mkdir /workspace
COPY bridge.pyx library.py server.py meson.build main.c /workspace/
RUN cd /workspace && meson build && cd build && ninja
WORKDIR /workspace
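
The commit doesn't show how the image is meant to be used, so here is a rough
sketch of a build-and-run cycle that fits this Dockerfile; the image tag, the
mount paths and the process count are illustrative guesses, not taken from the
repo:

```sh
# Build the image from the project root (the tag "fedavg_mpi" is arbitrary).
docker build -t fedavg_mpi .

# Run the MPI job inside the container: mount a data/ directory (see the
# README below for its contents) and a directory with the training text
# files; one training file needs at least 4*1 + 2 = 6 processes.
docker run --rm \
    -v "$PWD/data:/workspace/data" \
    -v "$PWD/corpus:/corpus" \
    fedavg_mpi \
    mpiexec -n 6 ./build/fedavg_mpi /corpus/textfile1
```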

README.md (new file): 108 lines

@@ -0,0 +1,108 @@
# Implementation of Federated Averaging with MPI, Keras and Cython

(_for educational purposes_)

## What's it doing?

The system implemented in this project learns word embeddings with the CBOW
approach and, furthermore, tries to do it in a distributed fashion. There are
two flavors of distribution present here:

1. Reading tokens (words) from a source (a text file for now), filtering and
   looking up vocabulary indices for words, windowing and batching are all
   implemented in separate processes and form an *input pipeline*.
2. Neural network training is done in parallel across several nodes
   (*learners*), with the learned weights periodically gathered, averaged and
   distributed by the central node, a.k.a. the *dispatcher*.

In this framework each learner can have its own input pipeline, all learners
can tap a single input pipeline, or something in between can also work. It's
not possible in the current version for one learner to tap more than one
pipeline, though.

## How to make this work

### Requirements

* A recent UNIX-y system
* A recent GCC (the default macOS clang also seems to work)
* MPICH 3
* Python 3.6 with dev headers and libraries (e.g. `python3-dev` on Ubuntu)
* Meson and ninja for building
* TensorFlow 1.14
* flask
* Cython

### Compiling

Compilation is supposed to be as simple as this (run in the project root):

```sh
meson build && cd build && ninja
```

If this fails, then either fix it yourself or let me know, I guess.

### Running

Now, this isn't without some quirks (due to this being a course project and
all). First, you have to run *FROM THE PROJECT ROOT* using the following
command (don't run it yet, as there are more instructions coming):

```sh
mpiexec -n NUM_PROC ./build/fedavg_mpi /path/to/training/data/textfile{1,2,3}
```

This program **expects a couple of things**:

First, **in the project root** create a directory `data` and put the following
files in there:

- `vocab.txt` -- a whitespace-separated list of words for which the embeddings
  will be learned. The words can only contain lowercase alphabetic ASCII chars
  (you can try lowercase UTF-8 and see what happens, but no guarantees here).
- `test.txt` -- a testing dataset with context windows of size 5, one line per
  window. The central (so third) word in the context window will be used as the
  target and the surrounding words as the source. The same requirements apply
  here as for the vocabulary, and furthermore only words present in `vocab.txt`
  are allowed in `test.txt`. This file will be used to track the loss of the
  network during training. An example of the `test.txt` format:

```
the quick brown fox jumped
over a lazy dog padword
```

There also needs to be a file `cfg.json` **in the project root** containing the
following fields (an example follows the list):

* `"data"`: `some_name` -- the name of the directory in which you put
  `vocab.txt` and `test.txt`;
* `"bpe"`: the number of independent learner SGD iterations per communication
  round;
* `"bs"`: the batch size (the number of context windows in a batch);
* `"target"`: the float value for the loss that you want to achieve; once the
  network reaches this loss, it will stop training, save the embeddings and
  exit.
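
For illustration, a `cfg.json` with the fields above could look roughly like
this (the values are made up; note also that `library.py` in this commit reads
`cfg.json` from the `data` directory and additionally interpolates a `"name"`
field into the saved embedding and checkpoint filenames):

```
{
    "name": "example_run",
    "data": "data",
    "bpe": 10,
    "bs": 32,
    "target": 1.5
}
```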

Then, for each training data file passed as an argument (these can reside
wherever you want them to), an input pipeline will be constructed in the
program. There are 3 nodes in the input pipeline (tokenizer, filter, batcher).
Then there's this rule that one learner isn't allowed to tap more than one
pipeline, so each pipeline will need at least one learner. There also needs to
be a dispatcher process and a visualizer process.

**TLDR:** The formula for the number of processes that you need to request from
`mpiexec -n` looks like this:

```
NUM_PROCS >= 4*num_data_files + 2
```

There is also a convenient (well, somewhat) formula to determine how many
learners you will get depending on the arguments you passed:

```
learners = NUM_PROCS - 2 - 3*num_data_files
```
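
So, for example, with two training data files you need at least
`4*2 + 2 = 10` processes, and requesting a few more just gives you extra
learners:

```
mpiexec -n 12  ->  learners = 12 - 2 - 3*2 = 4
```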

The good thing is, the program will complain if it doesn't like the numbers you
passed it and tell you how to fix it.

bridge.pyx

@@ -47,11 +47,6 @@ cdef public void serve():
     srv.serve()


-cdef public void bump_count():
-    eprint(f'bumping count from {srv.counter} to {srv.counter + 1}')
-    srv.counter += 1
-
-
 cdef public size_t getwin():
     return nn.WIN

@@ -72,10 +67,6 @@ cdef public float gettarget():
     return nn.CFG['target']


-cdef public float getflpc():
-    return nn.CFG['flpc']
-
-
 cdef public int get_tokens(WordList* wl, const char *filename):
     fnu = filename.decode('utf-8')
     if fnu not in tokenizers:

library.py: 38 lines changed

@@ -1,9 +1,8 @@
 import os
 import json
+from sys import stderr
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

-from mynet import onehot
-

 WIN = 2
 EMB = 32
@@ -11,21 +10,27 @@ EMB = 32
 HERE = os.path.abspath(os.path.dirname(__file__))


-def read_cfg():
-    with open(os.path.join(HERE, 'cfg.json')) as f:
-        return json.load(f)
-
-
-CFG = read_cfg()
-DATA = os.path.join(HERE, CFG['data'])
+DATA = os.path.join(HERE, 'data')
 RESULTS = os.path.join(HERE, 'trained')
 CORPUS = os.path.join(DATA, 'corpus.txt')
 VOCAB = os.path.join(DATA, 'vocab.txt')
 TEST = os.path.join(DATA, 'test.txt')


+if not os.path.exists(RESULTS):
+    os.mkdir(RESULTS)
+
+
+def read_cfg():
+    with open(os.path.join(DATA, 'cfg.json'), encoding='utf-8') as f:
+        return json.load(f)
+
+
+CFG = read_cfg()
+
+
 def read_vocab_list():
-    with open(VOCAB) as f:
+    with open(VOCAB, encoding='utf-8') as f:
         return f.read().split()

@@ -41,6 +46,13 @@ def word_tokenize(s: str):
     return l.split()


+def onehot(a, nc=10):
+    import numpy as np
+    oh = np.zeros((len(a), nc), dtype=np.float32)
+    oh[np.arange(len(a)), a.flatten().astype(np.int)] = 1
+    return oh
+
+
 def create_test_dataset():
     import numpy as np
     test_dataset = np.vectorize(vocab.get)(np.genfromtxt(TEST, dtype=str))
@@ -89,7 +101,7 @@ def eval_network(net):


 def token_generator(filename):
-    with open(filename) as f:
+    with open(filename, encoding='utf-8') as f:
         for l in f:
             if not l.isspace():
                 tok = word_tokenize(l)
@@ -103,8 +115,8 @@ def get_embeddings(net):

 def save_embeddings(emb):
     import numpy as np
-    np.savetxt(os.path.join(RESULTS, f'embeddings_{CFG["data"]}.csv'), emb)
+    np.savetxt(os.path.join(RESULTS, f'embeddings_{CFG["name"]}.csv'), emb)


 def ckpt_network(net):
-    net.save_weights(os.path.join(RESULTS, f'model_ckpt_{CFG["data"]}.h5'))
+    net.save_weights(os.path.join(RESULTS, f'model_ckpt_{CFG["name"]}.h5'))
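
The `onehot` helper added above (replacing the old `from mynet import onehot`)
one-hot encodes an array of vocabulary indices into a `(len(a), nc)` float32
matrix; roughly:

```
>>> import numpy as np
>>> onehot(np.array([1, 3]), nc=4)
array([[0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)
```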

main.c: 7 lines changed

@@ -368,14 +368,13 @@ void dispatcher() {
     size_t bs = getbs();
     size_t bpe = getbpe();
     float target = gettarget();
-    float flpc = getflpc();

     PyObject* frank = create_network();
     WeightList wl;
     init_weightlist_like(&wl, frank);
     update_weightlist(&wl, frank);

-    int lpr = number_of(LEARNER) * flpc; // Learners per round
+    int lpr = number_of(LEARNER);
     WeightList *wls = malloc(sizeof(WeightList) * lpr);
     for in_range(i, lpr) {
         init_weightlist_like(wls + i, frank);
@@ -445,10 +444,6 @@ void dispatcher() {
 void visualizer() {
     INFO_PRINTF("Starting visualizer %d\n", getpid());
     serve();
-    while (1) {
-        sleep(1);
-        bump_count();
-    }
 }

 int main (int argc, const char **argv) {

meson.build: 15 lines changed

@@ -1,10 +1,21 @@
 project('fedavg_mpi', 'c')
-add_global_arguments('-Wno-unused-command-line-argument', language: 'c')
 add_project_arguments(
   '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION',
   language: 'c'
 )

+compiler = meson.get_compiler('c')
+if compiler.has_argument('-Wno-unused-command-line-argument')
+  add_global_arguments('-Wno-unused-command-line-argument', language: 'c')
+endif
+
+if compiler.has_link_argument('-Wl,-w')
+  add_link_args = ['-Wl,-w']
+else
+  add_link_args = []
+endif
+
 mpi = dependency('mpi')
 python = dependency('python3')
 numpy_header = include_directories(run_command(
@@ -21,4 +32,4 @@ executable('fedavg_mpi',
   'main.c', bridge,
   dependencies: [mpi, python],
   include_directories: numpy_header,
-  link_args: '-Wl,-w')
+  link_args: add_link_args)