I did it

2019-12-15 19:51:30 -08:00
parent 05480606b0
commit 670f69e0df
4 changed files with 404 additions and 113 deletions


@@ -1 +1 @@
[draw.io diagram source updated: compressed <mxfile> XML (old and new revisions) omitted]

123
docs/generate_plots.py Normal file

@@ -0,0 +1,123 @@
import os
import re
from math import floor, ceil

import matplotlib.pyplot as plt
import numpy as np

HERE = os.path.abspath(os.path.dirname(__file__))
LOGS = os.path.join(HERE, '../../docs/logs/')

datasets = {
    'moby': {
        'idx': 0,
        'name': 'Moby Dick (~200k words)',
        'target': 8.4,
        'lim': (16000, 320000)
    },
    'wiki': {
        'name': 'English Wikipedia (~90M words)',
        'idx': 1,
        'target': 8.3,
        'lim': (16000, 360000)
    }
}


def s(n):
    # Naive pluralization helper for the legend labels.
    return 's' if n > 1 else ''


def idx_of(l, cond=lambda x: x):
    # Index of the first element satisfying cond, or -1 if there is none.
    try:
        return next(i for i, e in enumerate(l) if cond(e))
    except StopIteration:
        return -1


def meta_from_fn(fn):
    # Parse '<dataset>_<learners>_learner_<pipelines>_pp...' log file names.
    m = re.search(r'(.+)_(\d+)_learner_(\d+)_pp', fn)
    return (lambda x: (x[0], int(x[1]), int(x[2])))(m.group(1, 2, 3))


if __name__ == '__main__':
    files = sorted(os.listdir(LOGS), key=lambda x: meta_from_fn(x)[1])

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
    axs = fig.subplots(1, len(datasets))

    pp_speedup = []   # (pipelines, windows-to-target) for the wiki dataset
    l_speedup = []    # (learners, windows-to-target) for the wiki dataset

    for fn in files:
        name, learners, pipelines = meta_from_fn(fn)
        if learners == 16:
            continue  # the 16-Learner run is excluded from the plots
        with open(os.path.join(LOGS, fn)) as f:
            lines = f.readlines()
        # Extract (processed windows, validation loss) pairs from the log.
        matches = [re.search(r'windows (\d+) validation loss (\d+\.\d+)', l)
                   for l in lines]
        matches = [m for m in matches if m is not None]
        win_loss = [
            (lambda x: (int(x[0]), float(x[1])))(m.group(1, 2)) for m in matches
        ]
        windows, loss = zip(*win_loss)
        axs[datasets[name]['idx']].plot(
            windows[1:], loss[1:], linestyle='-' * (1 + (pipelines > 1)),
            color=f'C{learners // 2}',
            label=f'{learners} Learner{s(learners)},'
                  f' {pipelines} Pipeline{s(pipelines)}'
        )
        # Windows per Learner needed to reach the target loss ("time to target").
        ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
        if name == 'wiki':
            if pipelines > 1 or learners == 1:
                pp_speedup.append((pipelines, ttt))
            if pipelines == 1:
                l_speedup.append((learners, ttt))

    for d in datasets.values():
        a = axs[d['idx']]
        a.set_xlabel('Context Windows per Learner')
        a.set_ylabel('Validation Loss')
        # Reuses the windows axis of the last parsed log for the first tick.
        a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
        a.set_xlim(*d['lim'])
        a.set_title(d['name'])
        a.legend()
        a.axhline(d['target'], color='k', linestyle=':')

    fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))

    def speedup_plot(zipped):
        # Speedup relative to the first (smallest) configuration.
        factors, time = zip(*sorted(zipped))
        time = np.asarray(time)
        speedup = time[0] / time
        print(factors, time)
        plt.plot(factors, speedup)
        plt.xlim(min(factors), max(factors))
        plt.ylim(min(speedup), max(speedup))
        plt.xticks([*range(min(factors), max(factors) + 1)])
        plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
        plt.grid()

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)

    plt.subplot(121)
    speedup_plot(l_speedup)
    plt.title('Single Pipeline')
    plt.xlabel('Number of Learners')
    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')

    plt.subplot(122)
    speedup_plot(pp_speedup)
    plt.title('Multiple Pipelines')
    plt.xlabel('Number of Pipelines')
    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')

    plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
    plt.show()
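
For reference, the two regular expressions above expect log file names of the
form <dataset>_<learners>_learner_<pipelines>_pp... and progress lines that
report the number of processed windows together with the validation loss. The
snippet below only demonstrates the accepted shapes; the file name and the
numbers are made up for illustration:

import re

fn = 'wiki_4_learner_1_pp.log'                      # hypothetical log name
m = re.search(r'(.+)_(\d+)_learner_(\d+)_pp', fn)
print(m.group(1, 2, 3))                             # ('wiki', '4', '1')

line = 'windows 16000 validation loss 8.71'         # hypothetical progress line
m = re.search(r'windows (\d+) validation loss (\d+\.\d+)', line)
print(int(m.group(1)), float(m.group(2)))           # 16000 8.71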

147
docs/references.bib Normal file

@@ -0,0 +1,147 @@
@misc{mpich,
title={{MPICH | High-Performance Portable MPI}},
url={https://www.mpich.org/}
}
@inproceedings{nltk,
author = {Loper, Edward and Bird, Steven},
title = {{NLTK}: The Natural Language Toolkit},
booktitle = {Proceedings of the ACL-02 Workshop on Effective Tools and
Methodologies for Teaching Natural Language Processing and Computational
Linguistics --- Volume 1},
series = {ETMTNLP '02},
year = {2002},
pages = {63--70},
url = {https://www.nltk.org},
}
@inproceedings{cbow-skip-gram,
title={Distributed representations of words and phrases and their
compositionality},
author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S
and Dean, Jeff},
booktitle={Advances in neural information processing systems},
pages={3111--3119},
year={2013}
}
@article{fedavg,
author = {H. Brendan McMahan and
Eider Moore and
Daniel Ramage and
Blaise Ag{\"{u}}era y Arcas},
title = {Federated Learning of Deep Networks using Model Averaging},
journal = {CoRR},
volume = {abs/1602.05629},
year = {2016},
}
@misc{numpy,
title = {{NumPy}},
url = {https://numpy.org/}
}
@misc{meson,
title = {{The Meson Build system}},
url = {https://mesonbuild.com/}
}
@misc{gutenberg,
title = {{Project Gutenberg}},
url = {https://www.gutenberg.org/}
}
@misc{wikidump,
title = {{Wikimedia Downloads}},
url={https://dumps.wikimedia.org/}
}
@misc{wikiextractor,
title = {{WikiExtractor}},
url = {https://github.com/attardi/wikiextractor}
}
@misc{10k-words,
title = {{google-10000-english}},
url = {https://github.com/first20hours/google-10000-english}
}
@article{syngrad,
author = {Max Jaderberg and
Wojciech Marian Czarnecki and
Simon Osindero and
Oriol Vinyals and
Alex Graves and
Koray Kavukcuoglu},
title = {Decoupled Neural Interfaces using Synthetic Gradients},
journal = {CoRR},
volume = {abs/1608.05343},
year = {2016},
}
@inproceedings{fg-mpi,
author = {Kamal, Humaira and Wagner, Alan},
booktitle = {11th IEEE Intl. Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC) held in conjunction with IPDPS-24},
month = {April},
pages = {1--8},
title = {{FG-MPI}: Fine-grain {MPI} for Multicore and Clusters},
year = {2010},
}
@misc{tensorflow,
title={{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
url={http://tensorflow.org/},
author={
Mart\'{\i}n~Abadi and
Ashish~Agarwal and
Paul~Barham and
Eugene~Brevdo and
Zhifeng~Chen and
Craig~Citro and
Greg~S.~Corrado and
Andy~Davis and
Jeffrey~Dean and
Matthieu~Devin and
Sanjay~Ghemawat and
Ian~Goodfellow and
Andrew~Harp and
Geoffrey~Irving and
Michael~Isard and
Yangqing Jia and
Rafal~Jozefowicz and
Lukasz~Kaiser and
Manjunath~Kudlur and
Josh~Levenberg and
Dan~Man\'{e} and
Rajat~Monga and
Sherry~Moore and
Derek~Murray and
Chris~Olah and
Mike~Schuster and
Jonathon~Shlens and
Benoit~Steiner and
Ilya~Sutskever and
Kunal~Talwar and
Paul~Tucker and
Vincent~Vanhoucke and
Vijay~Vasudevan and
Fernanda~Vi\'{e}gas and
Oriol~Vinyals and
Pete~Warden and
Martin~Wattenberg and
Martin~Wicke and
Yuan~Yu and
Xiaoqiang~Zheng},
year={2015},
}
@article{cython,
url = {https://cython.org},
year = 2011,
month = {mar},
volume = {13},
number = {2},
pages = {31--39},
author = {Stefan Behnel and Robert Bradshaw and Craig Citro and Lisandro Dalcin and Dag Sverre Seljebotn and Kurt Smith},
title = {Cython: The Best of Both Worlds},
journal = {Computing in Science {\&} Engineering}
}


@@ -5,6 +5,8 @@
\usepackage{listings}
\lstset{basicstyle=\ttfamily}
+\renewcommand{\floatpagefraction}{.8}
\title{Distributed Natural Language Processing with MPI and Python}
\author{Pavel Lutskov for CPSC 521 @ UBC}
\begin{document}
@@ -51,7 +53,7 @@ module, which is used internally by NLTK, causes various conflicts when
incorporating the Python interpreter into a C application. For this reason,
NLTK had to be abandoned, and the focus of the project was shifted towards the
distributed Deep Learning-based computation of word embeddings with the help of
-TensorFlow framework.
+TensorFlow~\cite{tensorflow} framework.

\section{Architecture Overview}
@@ -93,8 +95,9 @@ window is filled it is sent down the pipeline for training batch assembly. In
the system implemented in this project a context window of size 5 is used.

In the final stage of the input pipeline, the node called \textit{Batcher}
-accumulates the context windows into batches, which can then be requested by
-a node containing the neural network for the actual neural network training.
+accumulates the context windows into batches, which can then be requested by a
+\textit{Learner} node containing the neural network for the actual neural
+network training.

The other dimension of the parallelism employed in this system is the
distributed neural network training. In this project, an approach
@@ -123,7 +126,7 @@ more than one input pipeline.
\begin{figure}[h]
\centering
\includegraphics[width=\linewidth]{fig/modes.pdf}
-\caption{Two Configurable Modes of System Operation}
+\caption{Possible Pipeline Configurations}
\label{fig:modes}
\end{figure}
@@ -146,17 +149,17 @@ Finally, the file \verb|bridge.pyx| provides interface functions for the C code
to access the Python functionality, thus creating a bridge between the
algorithms and the system aspects. In a \verb|.pyx| file, C and Python code can
be mixed rather freely, with occasional use of some special syntax. This file
-is translated by the Cython framework into \verb|bridge.c| and
+is translated by the Cython~\cite{cython} framework into \verb|bridge.c| and
\verb|bridge.h| files. The \verb|bridge.c| is then used as a compilation unit
for the final executable, and the \verb|bridge.h| is included into the
\verb|main.c| as a header file. In order for the compilation to succeed, the
compiler needs to be pointed towards the Python header files, and, since NumPy
code is used in \verb|bridge.pyx|, to the NumPy header files. Furthermore, the
application needs to be linked against the Python dynamic libraries, which
results in the Python interpreter being embedded into the final executable. In
order to simplify the compilation process and to make the codebase more
portable, the build system Meson~\cite{meson} was used in this project to
facilitate building.

\subsection{Running the Application} \label{ssec:running}
@@ -216,7 +219,8 @@ directory has to contain the following three files:
iterations each Learner will perform before sending the weights back to
the Dispatcher.
-\item \verb|"bs"| --- The number of context windows in a training batch.
+\item \verb|"bs"| --- Batch Size, the number of context windows in a
+training batch.
\item \verb|"target"| --- The targeted value of the neural network loss
function evaluated on the testing dataset. As soon as this value is
@@ -297,49 +301,50 @@ Batcher to stop too.
up their indices in the vocabulary by calling the \verb|vocab_idx_of(Word* w)|
function defined in \verb|bridge.pyx|. That function performs a dictionary
lookup for the word, based on the \verb|config/vocab.txt| file, and returns its
index on success or \verb|-1| if the word is not known. The Filter will
-assemble the indices in a \verb|long* windows| until enough words are
+assemble the indices in a \verb|long* window| variable until enough words are
received to send the context window to the Batcher. If a word received from the
Tokenizer is empty, the Filter sets the first element in the context window to
\verb|-1| and sends the window to the Batcher for termination.
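
A minimal Python sketch of the Filter logic just described may be clearer than
prose. The real Filter is a C routine that calls vocab_idx_of through
bridge.pyx; here run_filter, vocab (a dict built from config/vocab.txt),
send_to_batcher, and the decision to silently drop unknown words are
assumptions made only for illustration:

WINDOW_SIZE = 5   # context window size used in the project
TERMINATE = -1    # a window whose first element is -1 tells the Batcher to stop

def run_filter(words, vocab, send_to_batcher):
    # Sketch of the Filter node: look each word up and emit fixed-size windows.
    window = []
    for w in words:
        if w == '':                          # empty word marks the end of input
            send_to_batcher([TERMINATE] * WINDOW_SIZE)
            return
        idx = vocab.get(w, -1)               # stand-in for vocab_idx_of(): -1 = unknown
        if idx == -1:
            continue                         # assumption: unknown words are dropped
        window.append(idx)
        if len(window) == WINDOW_SIZE:
            send_to_batcher(window)          # full context window goes to the Batcher
            window = []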

\paragraph{Batcher} A Batcher is a rather simple pure C routine, that first
assembles the context windows into a batch, simultaneously converting
-\verb|long| into \verb|float|, and then waits for a Learner to announce
+\verb|long| into \verb|float|, and then waits for some Learner to announce
itself. Once it receives a signal from a Learner it responds with a batch and
starts assembling the next batch. Since this node may receive signals from both
Filter and Learner, it also may need to receive termination signals from both
in order to avoid waiting for a signal from a finished process. Therefore, if
the first element of the received window from the Tokenizer is \verb|-1|, or if
the Learner sends \verb|-1| when announcing itself, then the Batcher will
terminate immediately.

\paragraph{Learner} A Learner, implemented in \verb|learner| function in
-\verb|main.c| first creates a TensorFlow neural network object, by using
-\verb|bridge.pyx| as a bridge to the \verb|library.py|, and stores the network
-as a \verb|PyObject*|, defined in \verb|Python.h|.
+\verb|main.c| first creates a TensorFlow neural network object and stores the
+network as a \verb|PyObject*|.
It also initializes a C \verb|WeightList| struct to store the network weights
and to serve as a buffer for communication with the Dispatcher. It then waits
for the Dispatcher to announce a new training round, after which the Dispatcher
will send the weights and the Learner will receive the weights into the
\verb|WeightList| struct. Since a \verb|WeightList| has a rather complex
structure, a pair of functions \verb|send_weights| and \verb|recv_weights| are
used for communicating the weights. Then, the Learner will use the
\verb|WeightList| to set the neural network weights, by employing the
\verb|set_net_weights| function defined in \verb|bridge.pyx|. This is one of
the cases where it is particularly convenient to use Cython, since raw C memory
pointers can be easily converted to \verb|NumPy| arrays, which one then can
-directly use to set the network's weights.
+directly use to set the weights of a TensorFlow network.
Then, the Learner will perform a number of training iterations, specified by
\verb|"bpe"| key in \verb|config/cfg.json| file. For each iteration, the
Learner will send its MPI id to its designated Batcher and will receive a batch
in form of a \verb|float*|. This \verb|float*|, together with the
\verb|PyObject*| network object can be passed to the \verb|step_net| Cython
function to perform one step of training. This function, again, leverages the
ease of converting C data into NumPy arrays in Cython. Finally, after all
iterations, the weights of the network will be written to the \verb|WeightList|
by a Cython routine \verb|update_weightlist| and the \verb|WeightList| will be
sent back to the Dispatcher, and the Learner will wait for the signal to start
the next training round. If it instead receives a signal to stop training, then
it will send a \verb|-1| to its designated Batcher and terminate.
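
The Learner's round loop is easier to follow in code. The sketch below is a
loose Python/mpi4py rendering of the description above, not the actual C
implementation: the rank constants, the message format, and the stub helpers
(standing in for the Cython routines set_net_weights, step_net and
update_weightlist) are simplifications chosen only for illustration:

from mpi4py import MPI

comm = MPI.COMM_WORLD

DISPATCHER = 0      # assumed rank of the Dispatcher
STOP = -1           # stop / termination signal

def set_net_weights(net, weights):   # stand-in for the bridge.pyx helper
    net['w'] = weights

def step_net(net, batch):            # stand-in: one training step on a batch
    pass

def update_weightlist(net):          # stand-in: copy weights out of the network
    return net['w']

def learner(batcher_rank, bpe):
    # One Learner: receive weights, train for `bpe` iterations, send weights back.
    net = {'w': None}
    while True:
        signal = comm.recv(source=DISPATCHER)                # new round or stop
        if signal == STOP:
            comm.send(STOP, dest=batcher_rank)               # let the Batcher stop too
            return
        set_net_weights(net, comm.recv(source=DISPATCHER))   # recv_weights analogue
        for _ in range(bpe):                                 # "bpe" from config/cfg.json
            comm.send(comm.Get_rank(), dest=batcher_rank)    # announce itself
            step_net(net, comm.recv(source=batcher_rank))    # train on one batch
        comm.send(update_weightlist(net), dest=DISPATCHER)   # send_weights analogue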

\paragraph{Dispatcher} The Dispatcher also initializes a neural network and a
\verb|WeightList| structure using the same procedure as the Learner. This
@@ -362,36 +367,27 @@ statistics and exit.
\section{Evaluation}

The main focus of evaluation was to determine if executing several neural
-network training nodes in parallel can speed-up the training process. The first
-attempt to quantify performance was to train for a specified amount of training
-rounds and compare the final loss, the average loss decrease per training
-round, and the average loss decrease per second for system configurations with
-different number of Learner nodes. The problem with this approach, however, is
-that the loss curve doesn't have a linear shape when plotted against the number
-of training iterations, with usually a strong slope in the beginning of the
-training and then almost flat after some iterations, and is therefore a poor
-approximation for the \textit{time} it takes to train a neural network.
-Therefore, another approach was employed, which is to define a \textit{target
-loss} that the network has to achieve and then to measure \textit{the number
-of training windows} that each Learner node has to process and also the time
-it takes for the system to reach the target. The motivation behind this
-approach is that although the total number of training window consumed by the
-system is the number of windows for each Learner times the number of Learners,
+network training nodes in parallel can speed-up the training process. The
+employed approach was to define a \textit{target loss} that the network has to
+achieve and then to measure \textit{the number of context windows} that each
+Learner node has to process and, secondarily, the time it takes for the system
+to reach the target. The motivation behind this approach is that although the
+total number of training windows consumed by the system is the number of
+windows for each Learner times the number of Learners,
the Learners process their windows in parallel, thus the longest computation
path is as long as the number of windows that each Learner processes, which is
a reasonable approximation for parallel performance. Moreover, the tests have
shown that the training steps dominate the running time (the pipeline with a
single Learner could process around 45 batches/s, but over 500 batches/s when
the call to the training function was commented out), therefore the number of
context windows processed by Learners is the most important parameter for the
overall performance. It is also possible to count the processed batches and not
the context windows, however it may be interesting to compare the influence of
the number of the context windows in a batch (i.e.\@ the \textit{batch size})
on the training performance, such that e.g.\@ increasing the batch size might
actually reduce the amount of data needed for training.
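
As a concrete illustration of this figure of merit, the speedup reported below
is simply the windows-to-target of the 1-Learner baseline divided by the
windows-to-target of each configuration, exactly as in speedup_plot() from
docs/generate_plots.py. The numbers in this sketch are made up and serve only
to show the arithmetic, not to report measurements:

import numpy as np

learners = np.array([1, 2, 4, 8, 12])
# Hypothetical windows-per-Learner needed to reach the target loss.
windows_to_target = np.array([320000, 165000, 82000, 60000, 46000])

speedup = windows_to_target[0] / windows_to_target   # baseline / configuration
for n, s in zip(learners, speedup):
    print(f'{n:2d} Learner(s): {s:.2f}x')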

-Finally, the wall time was only used as a secondary measure, since due to time
+The wall time was only used as a secondary measure, since due to time
constraints and software incompatibility it was not possible to launch the
system on the computing cluster, so the tests had to be performed on a laptop
with a modest double core 1.3 GHz CPU, which means that using more than 2
@@ -415,43 +411,65 @@ context windows were randomly sampled from the dump file.
The test configurations were:
\begin{itemize}
-\item A single pipeline with 1, 2, 4, 8, 12, 16 Learners;
-\item or individual pipelines for 1, 2, 4 Learners, each reading a separate
-part of a dataset.
+\item a single pipeline with 1, 2, 4, 8, 12 Learners (up to 17 total
+processes);
+\item or individual pipelines for 1, 2, 4, 8 Learners, each reading a
+separate part of a dataset (up to 33 total processes).
\end{itemize}

-For the smaller of the two datasets the target was set to 8.40, and it can be
-observed in \autoref{fig:moby}, that modest speedups can be achieved
-when going from 1 Learner to 2 or 4 learners; employing 8 Learners or more,
+For the smaller of the two datasets the target was set to \verb|8.4|, and it
+can be observed in \autoref{fig:datasets}, that modest speedups can be achieved
+when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
however, doesn't result in any further improvement, with the system maxing out
on 1.6x speed up. A possible explanation for this is that the ``Moby Dick''
-book is too small to for the network to learn something meaningful and
-therefore the validation loss of 8.40 is the best that can be achieved, which
-can be done fairly quickly even with one Learner node.
+book is too small for multiple Learners to have sufficient data to train on.

-For the larger dataset with the target set to 8.30, however, the results were
-more promising, as can be seen in \autoref{fig:wiki}. Using 2 Learners instead
-of 1 resulted in superlinear reduction of both the amount of data consumed by
-each Learner (2.18x) and time to target (2.14x), which cannot be trivially
-explained and probably has to do something with the particularities of the
-training algorithm and the training data. This result also validates the use of
-the number of context windows consumed by each Learner as a proxy for system
-performance, since scaling within the number of available cores results in an
-almost perfect correlation between the amount of data per Learner and the wall
-time. Going from 2 to 4 Learners decreases the amount of data per Learner by
-another 1.7x, with the wall time remaining the same, demonstrating the core
-depletion on the laptop. Further increasing the number of learner nodes results
-in observable, but sub-linear speedups, with the 12 Learner System using 7x
-less data per Learner. This decrease in gains can probably be linked to the
-deficiencies of the neural network model being used, and thus, to achieve
-further speed-ups, the network architecture has to be investigated in more
-depth.
+For the larger dataset with the target set to \verb|8.3|, however, the results
+were more promising, as can be seen in \autoref{fig:datasets} and
+\autoref{fig:speedups}. Using 2 Learners instead of 1 resulted in nearly linear
+reduction of both the amount of data consumed by each Learner (1.95x) and time
+to target (1.94x). This result also validates the use of the number of context
+windows consumed by each Learner as a proxy for system performance, since
+scaling within the number of available cores results in an almost perfect
+correlation between the amount of data per Learner and the wall time. Going
+from 2 to 4 Learners decreases the amount of data per Learner by another 2x,
+with the wall time remaining roughly the same, demonstrating the core depletion
+on the laptop. Further increasing the number of Learner nodes results in
+observable, but sub-linear speedups, with the 12 Learner System using 7x less
+data per Learner to achieve the target loss of \verb|8.3|. This decrease in
+gains can probably be linked to the deficiencies of the neural network model
+being used, and thus, to achieve further speed-ups, the network architecture
+and training hyperparameters have to be investigated in more depth.
+Furthermore, the loss plots suggest that for longer training the difference
+between configurations with different number of Learners should still be
+observable, however, due to time and hardware constraints it was not possible
+to investigate the speed-ups achieved in longer running trials in more detail.

-Finally, as demonstrated in \autoref{fig:moby, fig:dick}, the systems with
-individual independent pipelines for each learner perform and scale worse than
-the single-pipeline systems. However, the trend for scaling is still visible
-and provides evidence that that training is possible even when non-IID
-heterogeneous data is available to each individual Learner.
+Finally, as can be observed in \autoref{fig:datasets} and
+\autoref{fig:speedups}, the systems with individual pipelines with independent
+input data for each Learner initially perform and scale worse than the
+single-pipeline systems. However, in the later stages of training the effect of
+using multiple pipelines becomes more positive, e.g.\@ the
+\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
+\mbox{12 Learner -- 1 Pipeline} system. Since input pipelines are
+computationally cheap, and it is computationally viable not to store the data
+as one big file but rather have it split across multiple nodes, this mode of
+operation should be investigated further and possibly preferred for
+large-scale training.
+
+\begin{figure}
+\centering
+\includegraphics[width=\linewidth]{fig/datasets.pdf}
+\caption{Validation Loss Against the Amount of Data per Learner}
+\label{fig:datasets}
+\end{figure}
+
+\begin{figure}
+\centering
+\includegraphics[width=\linewidth]{fig/speedups.pdf}
+\caption{Scalability Results with the English Wikipedia Dataset}
+\label{fig:speedups}
+\end{figure}

\section{Conclusion and Future Works}
\section{Conclusion and Future Works} \section{Conclusion and Future Works}
@@ -465,13 +483,12 @@ two parts. The drawbacks of this approach are that the full Python interpreter
still gets embedded into the C application, and, furthermore, some parts of
Python, such as the \verb|multiprocessing| module, result in failures when
embedded into a C application, which prohibits to use some Python libraries
-like \textit{scikit-learn} or \textit{NLTK} that use \verb|multiprocessing|
-internally.
+like NLTK that use \verb|multiprocessing| internally.

Another major accomplishment is the creation of a modular distributed Deep
Learning architecture for a basic NLP task, which can be further expanded to
compute higher level problems, like word prediction or sentiment analysis.
-Furthermore, this results of the tests show that there can be significant
+Furthermore, the results of the tests show that there can be significant
improvements in terms of training times if the training is performed on
multiple nodes in parallel, even with independent data on each node.
@@ -480,7 +497,11 @@ system currently uses CPU for neural network training, which is inefficient.
Therefore, it might be interesting to investigate whether MPI can be used to
distribute the system across the cluster of GPU-equipped nodes. Furthermore,
the architecture of the neural network probably requires some fine-tuning to
-achieve better scalability, as reported in~\cite{fedavg}. Finally, an
+achieve better scalability, as reported in~\cite{fedavg}. It would also be
+interesting to investigate finer-grain parallelism with FG-MPI~\cite{fg-mpi},
+especially for the input pipeline, since the pipeline nodes are rather too
+lightweight for each of them to occupy a separate process, and therefore the
+coroutine-based parallelism might be a better fit in this case. Finally, an
interesting direction would be to split the neural networks across multiple
nodes, with one neural network layer occupying one node (e.g.\@ as
in~\cite{syngrad}), which might distribute the computational load across the