diff --git a/.gitignore b/.gitignore
index 349876d..5101566 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,5 @@ compile_commands.json
 build/
 trained/
 __pycache__/
-data
+config
 .dockerignore
diff --git a/Dockerfile b/Dockerfile
index d018af6..69110e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,16 @@
 from ubuntu:18.04
 
 RUN apt -y update && \
-    apt -y install build-essential pkg-config ninja-build python3 python3-pip python3-dev mpich
-RUN pip3 install meson numpy tensorflow flask cython
+    apt -y install build-essential pkg-config ninja-build python3 \
+    python3-pip python3-dev mpich && \
+    apt clean
+
+RUN pip3 install --no-cache-dir \
+    meson==0.51.2 \
+    numpy==1.16.4 \
+    tensorflow==1.14.0 \
+    Flask==1.0.2 \
+    cython==0.29.14
 RUN mkdir /workspace
 COPY bridge.pyx library.py server.py meson.build main.c /workspace/
 RUN cd /workspace && meson build && cd build && ninja
diff --git a/README.md b/README.md
index 7c8a454..cf4ecef 100644
--- a/README.md
+++ b/README.md
@@ -55,13 +55,20 @@ mpiexec -n NUM_PROC ./build/fedavg_mpi /path/to/training/data/textfile{1,2,3}
 
 This program **expects a couple of things**:
 
-First, **in the project root** create a directory `data` and put in there
+First, **in the project root** create a directory `config` and put in there
 the following three files:
-- `vocab.txt` -- a whitespace-separated list of words, for which the embeddings
-  will be learned. The words can only contain lowercase alphabetic ASCII chars
-(you can try lowercase UTF-8 and see what happens but no guarantees here).
+
+- `vocab.txt` -- a whitespace-separated (newlines are okay too) list of words
+  for which the embeddings will be learned. The words can only contain
+  lowercase alphabetic ASCII chars (you can try lowercase UTF-8 and see what
+  happens, but no guarantees here). An example:
+
+```
+a quick brown fox jumped over a lazy dog padword hello world other words
+```
+
 - `test.txt` -- a testing dataset with context windows of size 5, one line per
-  window. The central (so third) word in the context window will be used as the
+  window. The central (third) word in the context window will be used as the
 target and the surrounding words as the source. The same requirements apply
 here as for the vocabulary, and furthermore only words present in the
 `vocab.txt` are allowed in `test.txt`. This file will be used to track the loss
@@ -72,19 +79,19 @@ the quick brown fox jumped over a lazy dog padword
 ```
 
-There also needs to be a file `cfg.json` **in the project root** containing the
-following fields:
+- `cfg.json` -- a JSON file with the following keys defined:
 
-* `"data"`: `some_name` -- the name of the directory in which you put
-`vocab.txt` and `test.txt`;
-* `"bpe"`: Number of independent learner SGD iterations per communication
-  round;
-* `"bs"`: batch size (the number of context windows in a batch);
-* `"target"`: The float value for the loss that you want to achieve, once the
-network reaches this loss it will stop training, save the embeddings and exit.
+  * `"data_name"`: The name of the dataset (can be whatever you call it);
+  * `"bpe"`: Number of independent learner SGD iterations per communication
+    round;
+  * `"bs"`: Batch size (the number of context windows in a batch);
+  * `"target"`: The float value for the loss that you want to achieve; once
+    the network reaches this loss, it will stop training, save the embeddings,
+    and exit.
 
 Then, for each training data file passed as an argument (these can reside
-wherever you want them to), an input pipeline will be constructed in the
+wherever you want them to, even in the `config` folder along with those three
+config files), an input pipeline will be constructed in the
 program. There are 3 nodes in the input pipeline (tokenizer, filter, batcher).
 Then there's this rule that one learner isn't allowed to tap more than one
 pipeline, so each pipeline will need at least one learner. There also needs to
@@ -106,3 +113,34 @@ learners = NUM_PROCS - 2 - 3*num_data_files
 
 The good thing is, the program will complain if it doesn't like the numbers
 you passed it and tell you how to fix it.
+
+The formula for assigning a learner to a pipeline looks like this:
+
+```
+pipeline_id = learner_id % number_of_pipelines
+```
+
+This ensures that the learners are assigned to pipelines as uniformly as
+possible.
+
+The program will then create a folder named `trained` in the project root
+and save the checkpointed model weights there as .h5 files. After training
+finishes, it will also save the resulting embedding matrix there as
+whitespace-separated CSV, with the order of the vectors corresponding to the
+order of words in `vocab.txt`.
+
+### Docker
+
+Alternatively, you can use Docker:
+
+```
+docker build -t fedavg-container .
+docker run --rm \
+    -it \
+    -v /host/path/to/config:/workspace/config \
+    -v /host/path/to/save/trained:/workspace/trained \
+    -v /host/path/to/dataset:/container/path/to/dataset \
+    fedavg-container \
+    mpiexec -n NUM_PROC ./build/fedavg_mpi \
+    /container/path/to/dataset/train_data{1,2,3,4}
+```
diff --git a/library.py b/library.py
index 96447ea..4376e84 100644
--- a/library.py
+++ b/library.py
@@ -1,8 +1,8 @@
 import os
-import json
-from sys import stderr
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
+import json
+import warnings
+warnings.simplefilter('ignore', category=FutureWarning)
 
 WIN = 2
 EMB = 32
@@ -10,11 +10,11 @@ EMB = 32
 
 
 HERE = os.path.abspath(os.path.dirname(__file__))
-DATA = os.path.join(HERE, 'data')
+CONFIG = os.path.join(HERE, 'config')
 RESULTS = os.path.join(HERE, 'trained')
-CORPUS = os.path.join(DATA, 'corpus.txt')
-VOCAB = os.path.join(DATA, 'vocab.txt')
-TEST = os.path.join(DATA, 'test.txt')
+CORPUS = os.path.join(CONFIG, 'corpus.txt')
+VOCAB = os.path.join(CONFIG, 'vocab.txt')
+TEST = os.path.join(CONFIG, 'test.txt')
 
 
 if not os.path.exists(RESULTS):
@@ -22,7 +22,7 @@ if not os.path.exists(RESULTS):
 
 
 def read_cfg():
-    with open(os.path.join(DATA, 'cfg.json'), encoding='utf-8') as f:
+    with open(os.path.join(CONFIG, 'cfg.json'), encoding='utf-8') as f:
         return json.load(f)
 
 
@@ -115,8 +115,10 @@ def get_embeddings(net):
 
 def save_embeddings(emb):
     import numpy as np
-    np.savetxt(os.path.join(RESULTS, f'embeddings_{CFG["name"]}.csv'), emb)
+    np.savetxt(os.path.join(RESULTS, f'embeddings_{CFG["data_name"]}.csv'),
+               emb)
 
 
 def ckpt_network(net):
-    net.save_weights(os.path.join(RESULTS, f'model_ckpt_{CFG["name"]}.h5'))
+    net.save_weights(os.path.join(RESULTS,
+                     f'model_ckpt_{CFG["data_name"]}.h5'))
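
For reference, the four `cfg.json` keys described in the README hunk above could be filled in as in the sketch below. The key names come from the diff; the concrete values are made-up placeholders, not settings taken from the project:

```json
{
    "data_name": "my_dataset",
    "bpe": 50,
    "bs": 32,
    "target": 0.5
}
```

After this patch the file is expected at `config/cfg.json`, which is where `read_cfg()` in `library.py` reads it from with `json.load`.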