ya sdelal'
This commit is contained in:
@@ -1 +1 @@
|
|||||||
<mxfile modified="2019-12-15T17:12:53.280Z" host="" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/11.1.1 Chrome/76.0.3809.88 Electron/6.0.0 Safari/537.36" etag="wUOw0lOGHGc77jKsLJXB" version="11.1.1" type="device"><diagram id="HbwVHqc1XiIdVxvrsfh9" name="Page-1">7Vxbc5s8EP01fuw3NiCwH/sl6WUmnbaT6fTy0lFBMbQYMUKu7f76CiMZg4ghdqyLm5cMWi8Yds/Rrg6KR+7VYv2awDx+hyOUjpxxtB651yPHmXk++1saNpUhcILKMCdJVJkmteEu+YO4ccytyyRCRcORYpzSJG8aQ5xlKKQNGyQEr5pu9zhtfmsO50gy3IUwla2fk4jGlXUKxrX9DUrmsfjmyZh/soDCmRuKGEZ4tWdyb0buFcGYVkeL9RVKy9iJuFTnvXrg092NEZTRISf8oquP928/pd+SYv0eBS++f3u3eSEuU9CNeGIUsQDwISY0xnOcwfSmtv5P8DKLUHnZMRvVPrcY58w4YcafiNINzyZcUsxMMV2k/FO0TuiXveOv5aX+cwAfXq/5pbeDjRhklGy+7A+q04AY1qdtR+K86gHLp2oErsBLEqJDoQEcbpDMET3kGMipmOwSzIiB8AKxO2IuBKWQJr+bdwI5ROc7P37qS0LgZs8hx0lGi70rfygNzIGTbUcazrVJ0EJEj/+s4c4OqhsQo70nqU1bkD0GcI45gLMUb24H9fXgzWnhbdqDt7Y/UAE49xlwp05wswsBnJjwzgs4zxzABZYiznVNQVy7pM4eV1KdQAXieFh/w3TJ4/A2y5flbX1IcpQmGZIg2QTcKk4ousvhNnsr1sg3wcUvjwhF6we7zwfyIxYCrTj6fLyqm+pd6xzvNdTe+OGM7kX1iKD55tD0OJY6fTQtGM/oy3IRxAxhCosiCYX5VZI2s3sEm4OhbPYNYbPrHmp4e90nSsgcSGS+RZBkiGjn8I6MYi0vc9hXSuGp7RTWzeCZdQw+3NH1+rc6xjNReGYNhUU/o43DYop95vCRHBZqgL0cbskAZnBYhNUGDru6OWy9phfoJrFrHYndgwvd/l5ahdgswmoBiR2gm8QGyVZmqVbuYJ0UPD05T8upLAxJSS5imJeH4SZNWC5JP/p/VEm//bEzwPDXfAuF90u6VZuejib9spGnlCW+FNHrpMghDWMDJhUX9IcrUBouWcz4vP3ewjIcSpF1NEcWgEufrhstV4bLbDb6rQNTupTXAbO85wyc5T1P55QubtNkrR+0er0OnVCt1u9Z/w5Yt9bveQPZAUx5cxcETRD2aP1tdyVav4iqgesTf2qW1u/9W/XuDAz2rWOw31oi92j9bX8lOqEnLwhMpbB2rd8Lnjl8GoentnO4T+vXw+GpPRzWrfV7M9s5rFvrF0qCRSRu9cZ9Wr/US6vQ+kVYLSCxdq0fGPRfH2Zp/WCwCuQ8PTlPy6ksDElJNltj9fzeJadSrR/Ibw8N0vqDATtq1SrSsphhp9YvRVa31u//W9rH+bV+MFQLqVYG2qZ0WVswXuvvWqCoFfvBxasMRzc3g+WDM+wyOi2n8grdsqLSbm66eKK0u/Gt//8X04rK0M30lWSsjUry5nTzi0qHcq22qAj2Xi5bji0q/tD95wJ4xhQVcecXVFQ6eKK2qFx876W4qPhD9ahKUtRGJVl8Mr6odKmwiouK9duSzlZUhm43EsAzp6jIKpjlRaWLJ2qLyoDF36MmkwgW8da3HNyzefsKp5jU8/kTBLH1MxBd29adjhieL4jBgGbHtCB6QF0U2bD+KbbqnWT9e3buzV8=</diagram></mxfile>
|
<mxfile modified="2019-12-15T18:24:15.861Z" host="" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/11.1.1 Chrome/76.0.3809.88 Electron/6.0.0 Safari/537.36" etag="JZI0mDDl2882QxsvCQuo" version="11.1.1" type="device"><diagram id="HbwVHqc1XiIdVxvrsfh9" name="Page-1">7Vxbc5s8EP01fuw3NiCwH/sl6WUmnbaT6fTy0lFBMbQYMUKu7f76CiMZg4ghdqyLm5cMWi8Yds/Rrg6KR+7VYv2awDx+hyOUjpxxtB651yPHmXk++1saNpUhcILKMCdJVJkmteEu+YO4ccytyyRCRcORYpzSJG8aQ5xlKKQNGyQEr5pu9zhtfmsO50gy3IUwla2fk4jGlXUKxrX9DUrmsfjmyZh/soDCmRuKGEZ4tWdyb0buFcGYVkeL9RVKy9iJuFTnvXrg092NEZTRISf8oquP928/pd+SYv0eBS++f3u3eSEuU9CNeGIUsQDwISY0xnOcwfSmtv5P8DKLUHnZMRvVPrcY58w4YcafiNINzyZcUsxMMV2k/FO0TuiXveOv5aX+cwAfXq/5pbeDjRhklGy+7A+q04AY1qdtR+K86gHLp2oErsBLEqJDoQEcbpDMET3kGMipmOwSzIiB8AKxO2IuBKWQJr+bdwI5ROc7P37qS0LgZs8hx0lGi70rfygNzIGTbUcazrVJ0EJEj/+s4c4OqhsQo70nqU1bkD0GcI45gLMUb24H9fXgzWnhbdqDt7Y/UAE49xlwp05wswsBnJjwzgs4zxzABZYiznVNQVy7pM4eV1KdQAXieFh/w3TJ4/A2y5flbX1IcpQmGZIg2QTcKk4ousvhNnsr1sg3wcUvjwhF6we7zwfyIxYCrTj6fLyqm+pd6xzvNdTe+OGM7kX1iKD55tD0OJY6fTQtGM/oy3IRxAxhCosiCYX5VZI2s3sEm4OhbPYNYbPrHmp4e90nSsgcSGS+RZBkiGjn8I6MYi0vc9hXSuGp7RTWzeCZdQw+3NH1+rc6xjNReGYNhUU/o43DYop95vCRHBZqgL0cbskAZnBYhNUGDru6OWy9phfoJrFrHYndgwvd/l5ahdgswmoBiR2gm8QGyVZmqVbuYJ0UPD05T8upLAxJSS5imJeH4SZNWC5JP/p/VEm//bEzwPDXfAuF90u6VZuejib9spGnlCW+FNHrpMghDWMDJhUX9IcrUBouWcz4vP3ewjIcSpF1NEcWgEufrhstV4bLbDb6rQNTupTXAbO85wyc5T1P55QubtNkrR+0er0OnVCt1u9Z/w5Yt9bveQPZAUx5cxcETRD2aP1tdyVav4iqgesTf2qW1u/9W/XuDAz2rWOw31oi92j9bX8lOqEnLwhMpbB2rd8Lnjl8GoentnO4T+vXw+GpPRzWrfV7M9s5rFvrF0qCRSRu9cZ9Wr/US6vQ+kVYLSCxdq0fGPRfH2Zp/WCwCuQ8PTlPy6ksDElJNltj9fzeJadSrR/Ibw8N0vqDATtq1SrSsphhp9YvRVa31u//W9rH+bV+MFQLqVYG2qZ0WVswXuvvWqCoFfvBxasMRzc3g+WDM+wyOi2n8grdsqLSbm66eKK0u/Gt//8X04rK0M30lWSsjUry5nTzi0qHcq22qAj2Xi5bji0q/tD95wJ4xhQVcecXVFQ6eKK2qFx876W4qPhD9ahKUtRGJVl8Mr6odKmwiouK9duSzlZUhm43EsAzp6jIKpjlRaWLJ2qLyoDF36MmkwgW8da3HNyzefsKp5jU8/kTBLH1MxBd29adjhieL4jBgGbHtCB6QF0U2bD+KbbqnWT9e3buzV8=</diagram></mxfile>
|
||||||
123
docs/generate_plots.py
Normal file
123
docs/generate_plots.py
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
from math import floor, ceil
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# Directory containing this script; output figures are written below it.
HERE = os.path.abspath(os.path.dirname(__file__))
# Directory that is scanned for training log files.
LOGS = os.path.join(HERE, '../../docs/logs/')


# Per-dataset plotting parameters, keyed by the dataset prefix of the log
# file names:
#   idx    - index of the subplot the dataset is drawn into
#   name   - subplot title
#   target - target validation loss (drawn as a dotted horizontal line)
#   lim    - x-axis limits of the subplot
datasets = dict(
    moby=dict(
        idx=0,
        name='Moby Dick (~200k words)',
        target=8.4,
        lim=(16000, 320000),
    ),
    wiki=dict(
        name='English Wikipedia (~90M words)',
        idx=1,
        target=8.3,
        lim=(16000, 360000),
    ),
)
|
||||||
|
|
||||||
|
|
||||||
|
def s(n):
    """Return the English plural suffix for a count of ``n`` items.

    Only a count of exactly one is singular, so zero also takes the
    plural form ("0 Learners", "1 Learner", "2 Learners").
    """
    return '' if n == 1 else 's'
|
||||||
|
|
||||||
|
|
||||||
|
def idx_of(l, cond=lambda x: x):
    """Return the position of the first element of ``l`` accepted by ``cond``.

    ``cond`` is called on each element in order and defaults to the
    element's own truthiness. If no element is accepted, -1 is returned.
    """
    for position, element in enumerate(l):
        if cond(element):
            return position
    return -1
|
||||||
|
|
||||||
|
|
||||||
|
def meta_from_fn(fn):
    """Extract the run parameters encoded in a training log file name.

    The name must contain ``<dataset>_<learners>_learner_<pipelines>_pp``.

    Returns:
        A ``(dataset, learners, pipelines)`` tuple; the dataset is a string
        and both counts are converted to ``int``.

    Raises:
        ValueError: if ``fn`` does not follow the naming scheme.
    """
    m = re.search(r'(.+)_(\d+)_learner_(\d+)_pp', fn)
    if m is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on ``None.group``.
        raise ValueError(f'not a training log file name: {fn!r}')
    name, learners, pipelines = m.group(1, 2, 3)
    return name, int(learners), int(pipelines)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Builds the two figures from the training logs found in LOGS:
    #   fig/datasets.pdf - validation loss curves, one subplot per dataset
    #   fig/speedups.pdf - speedup in reaching the target loss ('wiki' only)

    # LOGS may contain entries that are not training logs; meta_from_fn
    # cannot parse those, so they are filtered out before sorting. The
    # pattern mirrors the one used by meta_from_fn.
    is_log = re.compile(r'(.+)_(\d+)_learner_(\d+)_pp').search
    files = sorted(
        (fn for fn in os.listdir(LOGS) if is_log(fn)),
        key=lambda x: meta_from_fn(x)[1],
    )
    loss_record = re.compile(r'windows (\d+) validation loss (\d+\.\d+)')

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
    axs = fig.subplots(1, len(datasets))
    pp_speedup = []
    l_speedup = []
    # Window count of the second record of the most recently plotted log;
    # used as an additional x tick on every subplot.
    first_tick = None

    for fn in files:
        name, learners, pipelines = meta_from_fn(fn)
        if learners == 16:
            continue
        with open(os.path.join(LOGS, fn)) as f:
            matches = [loss_record.search(line) for line in f]
        win_loss = [
            (int(m.group(1)), float(m.group(2)))
            for m in matches if m is not None
        ]
        if not win_loss:
            # zip(*[]) yields nothing to unpack; such a log has no curve.
            print(f'{fn}: no validation loss records found, skipping')
            continue
        windows, loss = zip(*win_loss)
        if len(windows) > 1:
            first_tick = windows[1]

        # The first record is left out of the curve; pipelined runs are
        # drawn dashed ('--'), single-pipeline runs solid ('-').
        axs[datasets[name]['idx']].plot(
            windows[1:], loss[1:], linestyle='-' * (1 + (pipelines > 1)),
            color=f'C{learners // 2}',
            label=f'{learners} Learner{s(learners)},'
                  f' {pipelines} Pipeline{s(pipelines)}'
        )

        if name != 'wiki':
            continue
        hit = idx_of(loss, lambda l: l < datasets[name]['target'])
        if hit < 0:
            # idx_of signals "not found" with -1; indexing windows with it
            # would report the last window count as the time to target.
            print(f'{fn}: target loss never reached,'
                  ' excluded from the speedup plots')
            continue
        ttt = windows[hit]
        if pipelines > 1 or learners == 1:
            pp_speedup.append((pipelines, ttt))
        if pipelines == 1:
            l_speedup.append((learners, ttt))

    xticks = [*range(0, 300001, 100000)]
    if first_tick is not None:
        xticks = [first_tick] + xticks

    for d in datasets.values():
        a = axs[d['idx']]
        a.set_xlabel('Context Windows per Learner')
        a.set_ylabel('Validation Loss')
        a.set_xticks(xticks)
        a.set_xlim(*d['lim'])
        a.set_title(d['name'])
        a.legend()
        a.axhline(d['target'], color='k', linestyle=':')

    fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))

    def speedup_plot(zipped):
        """Plot speedup relative to the smallest factor on the current axes.

        ``zipped`` is a list of ``(factor, windows_to_target)`` pairs. The
        speedup of an entry is the window count of the smallest factor
        divided by its own window count.
        """
        if not zipped:
            print('no data points, speedup plot left empty')
            return
        factors, time = zip(*sorted(zipped))
        time = np.asarray(time)
        speedup = time[0] / time
        print(factors, time)
        plt.plot(factors, speedup)
        plt.xlim(min(factors), max(factors))
        plt.ylim(min(speedup), max(speedup))
        plt.xticks([*range(min(factors), max(factors) + 1)])
        plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
        plt.grid()

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)

    plt.subplot(121)
    speedup_plot(l_speedup)
    plt.title('Single Pipeline')
    plt.xlabel('Number of Learners')
    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')

    plt.subplot(122)
    speedup_plot(pp_speedup)
    plt.title('Multiple Pipelines')
    plt.xlabel('Number of Pipelines')
    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')

    plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
    plt.show()
|
||||||
147
docs/references.bib
Normal file
147
docs/references.bib
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
@misc{mpich,
|
||||||
|
title={{MPICH | High-Performance Portable MPI}},
|
||||||
|
url={https://www.mpich.org/}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{nltk,
|
||||||
|
author = {Loper, Edward and Bird, Steven},
|
||||||
|
title = {{NLTK}: The Natural Language Toolkit},
|
||||||
|
booktitle = {Proceedings of the ACL-02 Workshop on Effective Tools and
|
||||||
|
Methodologies for Teaching Natural Language Processing and Computational
|
||||||
|
Linguistics --- Volume 1},
|
||||||
|
series = {ETMTNLP '02},
|
||||||
|
year = {2002},
|
||||||
|
pages = {63--70},
|
||||||
|
url = {https://www.nltk.org},
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{cbow-skip-gram,
|
||||||
|
title={Distributed representations of words and phrases and their
|
||||||
|
compositionality},
|
||||||
|
author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S
|
||||||
|
and Dean, Jeff},
|
||||||
|
booktitle={Advances in neural information processing systems},
|
||||||
|
pages={3111--3119},
|
||||||
|
year={2013}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{fedavg,
|
||||||
|
author = {H. Brendan McMahan and
|
||||||
|
Eider Moore and
|
||||||
|
Daniel Ramage and
|
||||||
|
Blaise Ag{\"{u}}era y Arcas},
|
||||||
|
title = {Federated Learning of Deep Networks using Model Averaging},
|
||||||
|
journal = {CoRR},
|
||||||
|
volume = {abs/1602.05629},
|
||||||
|
year = {2016},
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{numpy,
|
||||||
|
title = {{NumPy}},
|
||||||
|
url = {https://numpy.org/}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{meson,
|
||||||
|
title = {{The Meson Build system}},
|
||||||
|
url = {https://mesonbuild.com/}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{gutenberg,
|
||||||
|
title = {{Project Gutenberg}},
|
||||||
|
url = {https://www.gutenberg.org/}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{wikidump,
|
||||||
|
title = {{Wikimedia Downloads}},
|
||||||
|
url={https://dumps.wikimedia.org/}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{wikiextractor,
|
||||||
|
title = {{WikiExtractor}},
|
||||||
|
url = {https://github.com/attardi/wikiextractor}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{10k-words,
|
||||||
|
url = {https://github.com/first20hours/google-10000-english}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{syngrad,
|
||||||
|
author = {Max Jaderberg and
|
||||||
|
Wojciech Marian Czarnecki and
|
||||||
|
Simon Osindero and
|
||||||
|
Oriol Vinyals and
|
||||||
|
Alex Graves and
|
||||||
|
Koray Kavukcuoglu},
|
||||||
|
title = {Decoupled Neural Interfaces using Synthetic Gradients},
|
||||||
|
journal = {CoRR},
|
||||||
|
volume = {abs/1608.05343},
|
||||||
|
year = {2016},
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{fg-mpi,
|
||||||
|
author = {Kamal, Humaira and Wagner, Alan},
|
||||||
|
Booktitle = {11th IEEE Intl. Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC) held in conjunction with IPDPS-24},
|
||||||
|
month = {April},
|
||||||
|
pages = {1--8},
|
||||||
|
title = {{FG-MPI}: Fine-grain {MPI} for Multicore and Clusters},
|
||||||
|
year = {2010},
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{tensorflow,
|
||||||
|
title={{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
|
||||||
|
url={http://tensorflow.org/},
|
||||||
|
author={
|
||||||
|
Mart\'{\i}n~Abadi and
|
||||||
|
Ashish~Agarwal and
|
||||||
|
Paul~Barham and
|
||||||
|
Eugene~Brevdo and
|
||||||
|
Zhifeng~Chen and
|
||||||
|
Craig~Citro and
|
||||||
|
Greg~S.~Corrado and
|
||||||
|
Andy~Davis and
|
||||||
|
Jeffrey~Dean and
|
||||||
|
Matthieu~Devin and
|
||||||
|
Sanjay~Ghemawat and
|
||||||
|
Ian~Goodfellow and
|
||||||
|
Andrew~Harp and
|
||||||
|
Geoffrey~Irving and
|
||||||
|
Michael~Isard and
|
||||||
|
Yangqing Jia and
|
||||||
|
Rafal~Jozefowicz and
|
||||||
|
Lukasz~Kaiser and
|
||||||
|
Manjunath~Kudlur and
|
||||||
|
Josh~Levenberg and
|
||||||
|
Dan~Man\'{e} and
|
||||||
|
Rajat~Monga and
|
||||||
|
Sherry~Moore and
|
||||||
|
Derek~Murray and
|
||||||
|
Chris~Olah and
|
||||||
|
Mike~Schuster and
|
||||||
|
Jonathon~Shlens and
|
||||||
|
Benoit~Steiner and
|
||||||
|
Ilya~Sutskever and
|
||||||
|
Kunal~Talwar and
|
||||||
|
Paul~Tucker and
|
||||||
|
Vincent~Vanhoucke and
|
||||||
|
Vijay~Vasudevan and
|
||||||
|
Fernanda~Vi\'{e}gas and
|
||||||
|
Oriol~Vinyals and
|
||||||
|
Pete~Warden and
|
||||||
|
Martin~Wattenberg and
|
||||||
|
Martin~Wicke and
|
||||||
|
Yuan~Yu and
|
||||||
|
Xiaoqiang~Zheng},
|
||||||
|
year={2015},
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{cython,
|
||||||
|
url = {https://cython.org},
|
||||||
|
year = 2011,
|
||||||
|
month = {mar},
|
||||||
|
volume = {13},
|
||||||
|
number = {2},
|
||||||
|
pages = {31--39},
|
||||||
|
author = {Stefan Behnel and Robert Bradshaw and Craig Citro and Lisandro Dalcin and Dag Sverre Seljebotn and Kurt Smith},
|
||||||
|
title = {Cython: The Best of Both Worlds},
|
||||||
|
journal = {Computing in Science {\&} Engineering}
|
||||||
|
}
|
||||||
@@ -5,6 +5,8 @@
|
|||||||
\usepackage{listings}
|
\usepackage{listings}
|
||||||
\lstset{basicstyle=\ttfamily}
|
\lstset{basicstyle=\ttfamily}
|
||||||
|
|
||||||
|
\renewcommand{\floatpagefraction}{.8}
|
||||||
|
|
||||||
\title{Distributed Natural Language Processing with MPI and Python}
|
\title{Distributed Natural Language Processing with MPI and Python}
|
||||||
\author{Pavel Lutskov for CPSC 521 @ UBC}
|
\author{Pavel Lutskov for CPSC 521 @ UBC}
|
||||||
\begin{document}
|
\begin{document}
|
||||||
@@ -51,7 +53,7 @@ module, which is used internally by NLTK, causes various conflicts when
|
|||||||
incorporating the Python interpreter into a C application. For this reason,
|
incorporating the Python interpreter into a C application. For this reason,
|
||||||
NLTK had to be abandoned, and the focus of the project was shifted towards the
|
NLTK had to be abandoned, and the focus of the project was shifted towards the
|
||||||
distributed Deep Learning-based computation of word embeddings with the help of
|
distributed Deep Learning-based computation of word embeddings with the help of
|
||||||
TensorFlow framework.
|
the TensorFlow~\cite{tensorflow} framework.
|
||||||
|
|
||||||
\section{Architecture Overview}
|
\section{Architecture Overview}
|
||||||
|
|
||||||
@@ -93,8 +95,9 @@ window is filled it is sent down the pipeline for training batch assembly. In
|
|||||||
the system implemented in this project a context window of size 5 is used.
|
the system implemented in this project a context window of size 5 is used.
|
||||||
|
|
||||||
In the final stage of the input pipeline, the node called \textit{Batcher}
|
In the final stage of the input pipeline, the node called \textit{Batcher}
|
||||||
accumulates the context windows into batches, which can then be requested by
|
accumulates the context windows into batches, which can then be requested by a
|
||||||
a node containing the neural network for the actual neural network training.
|
\textit{Learner} node containing the neural network for the actual neural
|
||||||
|
network training.
|
||||||
|
|
||||||
The other dimension of the parallelism employed in this system is the
|
The other dimension of the parallelism employed in this system is the
|
||||||
distributed neural network training. In this project, an approach
|
distributed neural network training. In this project, an approach
|
||||||
@@ -123,7 +126,7 @@ more than one input pipeline.
|
|||||||
\begin{figure}[h]
|
\begin{figure}[h]
|
||||||
\centering
|
\centering
|
||||||
\includegraphics[width=\linewidth]{fig/modes.pdf}
|
\includegraphics[width=\linewidth]{fig/modes.pdf}
|
||||||
\caption{Two Configurable Modes of System Operation}
|
\caption{Possible Pipeline Configurations}
|
||||||
\label{fig:modes}
|
\label{fig:modes}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
@@ -146,17 +149,17 @@ Finally, the file \verb|bridge.pyx| provides interface functions for the C code
|
|||||||
to access the Python functionality, thus creating a bridge between the
|
to access the Python functionality, thus creating a bridge between the
|
||||||
algorithms and the system aspects. In a \verb|.pyx| file, C and Python code can
|
algorithms and the system aspects. In a \verb|.pyx| file, C and Python code can
|
||||||
be mixed rather freely, with occasional use of some special syntax. This file
|
be mixed rather freely, with occasional use of some special syntax. This file
|
||||||
is translated by the Cython framework into \verb|bridge.c| and \verb|bridge.h|
|
is translated by the Cython~\cite{cython} framework into \verb|bridge.c| and
|
||||||
files. The \verb|bridge.c| is then used as a compilation unit for the final
|
\verb|bridge.h| files. The \verb|bridge.c| is then used as a compilation unit
|
||||||
executable, and the \verb|bridge.h| is included into the \verb|main.c| as a
|
for the final executable, and the \verb|bridge.h| is included into the
|
||||||
header file. In order for the compilation to succeed, the compiler needs to be
|
\verb|main.c| as a header file. In order for the compilation to succeed, the
|
||||||
pointed towards the Python header files, and, since NumPy code is used in
|
compiler needs to be pointed towards the Python header files, and, since NumPy
|
||||||
\verb|bridge.pyx|, to the NumPy header files. Furthermore, the application
|
code is used in \verb|bridge.pyx|, to the NumPy header files. Furthermore, the
|
||||||
needs to be linked against the Python dynamic libraries, which results in the
|
application needs to be linked against the Python dynamic libraries, which
|
||||||
Python interpreter being embedded into the final executable. In order to
|
results in the Python interpreter being embedded into the final executable. In
|
||||||
simplify the compilation process and to make the codebase more portable, the
|
order to simplify the compilation process and to make the codebase more
|
||||||
build system Meson~\cite{meson} was used in this project to facilitate
|
portable, the build system Meson~\cite{meson} was used in this project to
|
||||||
building.
|
facilitate building.
|
||||||
|
|
||||||
\subsection{Running the Application} \label{ssec:running}
|
\subsection{Running the Application} \label{ssec:running}
|
||||||
|
|
||||||
@@ -216,7 +219,8 @@ directory has to contain the following three files:
|
|||||||
iterations each Learner will perform before sending the weights back to
|
iterations each Learner will perform before sending the weights back to
|
||||||
the Dispatcher.
|
the Dispatcher.
|
||||||
|
|
||||||
\item \verb|"bs"| --- The number of context windows in a training batch.
|
\item \verb|"bs"| --- Batch Size, the number of context windows in a
|
||||||
|
training batch.
|
||||||
|
|
||||||
\item \verb|"target"| --- The targeted value of the neural network loss
|
\item \verb|"target"| --- The targeted value of the neural network loss
|
||||||
function evaluated on the testing dataset. As soon as this value is
|
function evaluated on the testing dataset. As soon as this value is
|
||||||
@@ -297,49 +301,50 @@ Batcher to stop too.
|
|||||||
up their indices in the vocabulary by calling the \verb|vocab_idx_of(Word* w)|
|
up their indices in the vocabulary by calling the \verb|vocab_idx_of(Word* w)|
|
||||||
function defined in \verb|bridge.pyx|. That function performs a dictionary
|
function defined in \verb|bridge.pyx|. That function performs a dictionary
|
||||||
lookup for the word, based on the \verb|config/vocab.txt| file, and returns its
|
lookup for the word, based on the \verb|config/vocab.txt| file, and returns its
|
||||||
index on success or \verb|-1| if the word is not known. The Filter will assemble the
|
index on success or \verb|-1| if the word is not known. The Filter will
|
||||||
indices in a \verb|long* windows| until enough words are received to send the
|
assemble the indices in a \verb|long* window| variable until enough words are
|
||||||
context window to the Batcher. If a word received from the Tokenizer is empty,
|
received to send the context window to the Batcher. If a word received from the
|
||||||
the Filter sets the first element in the context window to \verb|-1| and sends the
|
Tokenizer is empty, the Filter sets the first element in the context window to
|
||||||
window to the Batcher for termination.
|
\verb|-1| and sends the window to the Batcher for termination.
|
||||||
|
|
||||||
\paragraph{Batcher} A Batcher is a rather simple pure C routine, that first
|
\paragraph{Batcher} A Batcher is a rather simple pure C routine, that first
|
||||||
assembles the context windows into a batch, simultaneously converting
|
assembles the context windows into a batch, simultaneously converting
|
||||||
\verb|long| into \verb|float|, and then waits for a Learner to announce itself.
|
\verb|long| into \verb|float|, and then waits for some Learner to announce
|
||||||
Once it receives a signal from a Learner it responds with a batch and starts
|
itself. Once it receives a signal from a Learner it responds with a batch and
|
||||||
assembling the next batch. Since this node may receive signals from both Filter
|
starts assembling the next batch. Since this node may receive signals from both
|
||||||
and Learner, it also may need to receive termination signals from both in order
|
Filter and Learner, it also may need to receive termination signals from both
|
||||||
to avoid waiting for a signal from a finished process. Therefore, if the first
|
in order to avoid waiting for a signal from a finished process. Therefore, if
|
||||||
element of the received window from the Tokenizer is \verb|-1|, or if the Learner
|
the first element of the received window from the Tokenizer is \verb|-1|, or if
|
||||||
sends \verb|-1| when announcing itself, then the Batcher will terminate immediately.
|
the Learner sends \verb|-1| when announcing itself, then the Batcher will
|
||||||
|
terminate immediately.
|
||||||
|
|
||||||
\paragraph{Learner} A Learner, implemented in \verb|learner| function in
|
\paragraph{Learner} A Learner, implemented in \verb|learner| function in
|
||||||
\verb|main.c| first creates a TensorFlow neural network object, by using
|
\verb|main.c|, first creates a TensorFlow neural network object and stores the
|
||||||
\verb|bridge.pyx| as a bridge to the \verb|library.py|, and stores the network
|
network as a \verb|PyObject*|. It also initializes a C \verb|WeightList| struct
|
||||||
as a \verb|PyObject*|, defined in \verb|Python.h|. It also initializes a C
|
to store the network weights and to serve as a buffer for communication with
|
||||||
\verb|WeightList| struct to store the network weights and to serve as a buffer
|
the Dispatcher. It then waits for the Dispatcher to announce a new training
|
||||||
for communication with the Dispatcher. It then waits for the Dispatcher to
|
round, after which the Dispatcher will send the weights and the Learner will
|
||||||
announce a new training round, after which the Dispatcher will send the weights
|
receive the weights into the \verb|WeightList| struct. Since a
|
||||||
and the Learner will receive the weights into the \verb|WeightList| struct.
|
\verb|WeightList| has a rather complex structure, a pair of functions
|
||||||
Since a \verb|WeightList| has a rather complex structure, a pair of functions
|
|
||||||
\verb|send_weights| and \verb|recv_weights| are used for communicating the
|
\verb|send_weights| and \verb|recv_weights| are used for communicating the
|
||||||
weights. Then, the Learner will use the \verb|WeightList| to set the neural
|
weights. Then, the Learner will use the \verb|WeightList| to set the neural
|
||||||
network weights, by employing the \verb|set_net_weights| function defined in
|
network weights, by employing the \verb|set_net_weights| function defined in
|
||||||
\verb|bridge.pyx|. This is one of the cases where it is particularly convenient
|
\verb|bridge.pyx|. This is one of the cases where it is particularly convenient
|
||||||
to use Cython, since raw C memory pointers can be easily converted to
|
to use Cython, since raw C memory pointers can be easily converted to
|
||||||
\verb|NumPy| arrays, which one then can directly use to set the network's
|
\verb|NumPy| arrays, which one then can directly use to set the weights of a
|
||||||
weights. Then, the Learner will perform a number of training iterations,
|
TensorFlow network. Then, the Learner will perform a number of training
|
||||||
specified by \verb|"bpe"| key in \verb|config/cfg.json| file. For each
|
iterations, specified by the \verb|"bpe"| key in the \verb|config/cfg.json| file. For
|
||||||
iteration, the Learner will send its MPI id to its designated Batcher and will
|
each iteration, the Learner will send its MPI id to its designated Batcher and
|
||||||
receive a batch in form of a \verb|float*|. This \verb|float*|, together with
|
will receive a batch in the form of a \verb|float*|. This \verb|float*|, together
|
||||||
the \verb|PyObject*| network object can be passed to the \verb|step_net| Cython
|
with the \verb|PyObject*| network object can be passed to the \verb|step_net|
|
||||||
function to perform one step of training. This function, again, leverages the
|
Cython function to perform one step of training. This function, again,
|
||||||
ease of converting C data into NumPy arrays in Cython. Finally, after all
|
leverages the ease of converting C data into NumPy arrays in Cython. Finally,
|
||||||
iterations, the weights of the network will be written to the \verb|WeightList|
|
after all iterations, the weights of the network will be written to the
|
||||||
by a Cython routine \verb|update_weightlist| and the \verb|WeightList| will be
|
\verb|WeightList| by a Cython routine \verb|update_weightlist| and the
|
||||||
sent back to the Dispatcher, and the Learner will wait for the signal to start
|
\verb|WeightList| will be sent back to the Dispatcher, and the Learner will
|
||||||
the next training round. If it instead receives a signal to stop training, then
|
wait for the signal to start the next training round. If it instead receives a
|
||||||
it will send a \verb|-1| to its designated Batcher and terminate.
|
signal to stop training, then it will send a \verb|-1| to its designated
|
||||||
|
Batcher and terminate.
|
||||||
|
|
||||||
\paragraph{Dispatcher} The Dispatcher also initializes a neural network and a
|
\paragraph{Dispatcher} The Dispatcher also initializes a neural network and a
|
||||||
\verb|WeightList| structure using the same procedure as the Learner. This
|
\verb|WeightList| structure using the same procedure as the Learner. This
|
||||||
@@ -362,36 +367,27 @@ statistics and exit.
|
|||||||
\section{Evaluation}
|
\section{Evaluation}
|
||||||
|
|
||||||
The main focus of evaluation was to determine if executing several neural
|
The main focus of evaluation was to determine if executing several neural
|
||||||
network training nodes in parallel can speed-up the training process. The first
|
network training nodes in parallel can speed up the training process. The
|
||||||
attempt to quantify performance was to train for a specified amount of training
|
employed approach was to define a \textit{target loss} that the network has to
|
||||||
rounds and compare the final loss, the average loss decrease per training
|
achieve and then to measure \textit{the number of context windows} that each
|
||||||
round, and the average loss decrease per second for system configurations with
|
Learner node has to process and, secondarily, the time it takes for the system
|
||||||
different number of Learner nodes. The problem with this approach, however, is
|
to reach the target. The motivation behind this approach is that although the
|
||||||
that the loss curve doesn't have a linear shape when plotted against the number
|
total number of training windows consumed by the system is the number of
|
||||||
of training iterations, with usually a strong slope in the beginning of the
|
windows for each Learner times the number of Learners, the Learners process
|
||||||
training and then almost flat after some iterations, and is therefore a poor
|
their windows in parallel, thus the longest computation path is as long as the
|
||||||
approximation for the \textit{time} it takes to train a neural network.
|
number of windows that each Learner processes, which is a reasonable
|
||||||
|
approximation for parallel performance. Moreover, the tests have shown that the
|
||||||
Therefore, another approach was employed, which is to define a \textit{target
|
training steps dominate the running time (the pipeline with a single Learner
|
||||||
loss} that the network has to achieve and then to measure \textit{the number
|
could process around 45 batches/s, but over 500 batches/s when the call to the
|
||||||
of training windows} that each Learner node has to process and also the time
|
training function was commented out), therefore the number of context windows
|
||||||
it takes for the system to reach the target. The motivation behind this
|
processed by Learners is the most important parameter for the overall
|
||||||
approach is that although the total number of training window consumed by the
|
performance. It is also possible to count the processed batches and not the
|
||||||
system is the number of windows for each Learner times the number of Learners,
|
context windows; however, it may be interesting to compare the influence of the
|
||||||
the Learners process their windows in parallel, thus the longest computation
|
number of the context windows in a batch (i.e.\@ the \textit{batch size}) on
|
||||||
path is as long as the number of windows that each Learner processes, which is
|
the training performance, such that e.g.\@ increasing the batch size might
|
||||||
a reasonable approximation for parallel performance. Moreover, the tests have
|
|
||||||
shown that the training steps dominate the running time (the pipeline with a
|
|
||||||
single Learner could process around 45 batches/s, but over 500 batches/s when
|
|
||||||
the call to the training function was commented out), therefore the number of
|
|
||||||
context windows processed by Learners is the most important parameter for the
|
|
||||||
overall performance. It is also possible to count the processed batches and not
|
|
||||||
the context windows, however it may be interesting to compare the influence of
|
|
||||||
the number of the context windows in a batch (i.e.\@ the \textit{batch size})
|
|
||||||
on the training performance, such that e.g.\@ increasing the batch size might
|
|
||||||
actually reduce the amount of data needed for training.
|
actually reduce the amount of data needed for training.
|
||||||
|
|
||||||
Finally, the wall time was only used as a secondary measure, since due to time
|
The wall time was only used as a secondary measure, since due to time
|
||||||
constraints and software incompatibility it was not possible to launch the
|
constraints and software incompatibility it was not possible to launch the
|
||||||
system on the computing cluster, so the tests had to be performed on a laptop
|
system on the computing cluster, so the tests had to be performed on a laptop
|
||||||
with a modest double core 1.3 GHz CPU, which means that using more than 2
|
with a modest double core 1.3 GHz CPU, which means that using more than 2
|
||||||
@@ -415,43 +411,65 @@ context windows were randomly sampled from the dump file.
|
|||||||
The test configurations were:
|
The test configurations were:
|
||||||
|
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
\item A single pipeline with 1, 2, 4, 8, 12, 16 Learners;
|
\item a single pipeline with 1, 2, 4, 8, 12 Learners (up to 17 total
|
||||||
\item or individual pipelines for 1, 2, 4 Learners, each reading a separate
|
processes);
|
||||||
part of a dataset.
|
\item or individual pipelines for 1, 2, 4, 8 Learners, each reading a
|
||||||
|
separate part of a dataset (up to 33 total processes).
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
|
|
||||||
For the smaller of the two datasets the target was set to 8.40, and it can be
|
For the smaller of the two datasets the target was set to \verb|8.4|, and it
|
||||||
observed in \autoref{fig:moby}, that modest speedups can be achieved
|
can be observed in \autoref{fig:datasets}, that modest speedups can be achieved
|
||||||
when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
|
when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
|
||||||
however, doesn't result in any further improvement, with the system maxing out
|
however, doesn't result in any further improvement, with the system maxing out
|
||||||
at a 1.6x speedup. A possible explanation for this is that the ``Moby Dick''
|
at a 1.6x speedup. A possible explanation for this is that the ``Moby Dick''
|
||||||
book is too small for the network to learn something meaningful and
|
book is too small for multiple Learners to have sufficient data to train on.
|
||||||
therefore the validation loss of 8.40 is the best that can be achieved, which
|
|
||||||
can be done fairly quickly even with one Learner node.
|
|
||||||
|
|
||||||
For the larger dataset with the target set to 8.30, however, the results were
|
For the larger dataset with the target set to \verb|8.3|, however, the results
|
||||||
more promising, as can be seen in \autoref{fig:wiki}. Using 2 Learners instead
|
were more promising, as can be seen in \autoref{fig:datasets} and
|
||||||
of 1 resulted in superlinear reduction of both the amount of data consumed by
|
\autoref{fig:speedups}. Using 2 Learners instead of 1 resulted in nearly linear
|
||||||
each Learner (2.18x) and time to target (2.14x), which cannot be trivially
|
reduction of both the amount of data consumed by each Learner (1.95x) and time
|
||||||
explained and probably has something to do with the particularities of the
|
to target (1.94x). This result also validates the use of the number of context
|
||||||
training algorithm and the training data. This result also validates the use of
|
windows consumed by each Learner as a proxy for system performance, since
|
||||||
the number of context windows consumed by each Learner as a proxy for system
|
scaling within the number of available cores results in an almost perfect
|
||||||
performance, since scaling within the number of available cores results in an
|
correlation between the amount of data per Learner and the wall time. Going
|
||||||
almost perfect correlation between the amount of data per Learner and the wall
|
from 2 to 4 Learners decreases the amount of data per Learner by another 2x,
|
||||||
time. Going from 2 to 4 Learners decreases the amount of data per Learner by
|
with the wall time remaining roughly the same, demonstrating the core depletion
|
||||||
another 1.7x, with the wall time remaining the same, demonstrating the core
|
on the laptop. Further increasing the number of Learner nodes results in
|
||||||
depletion on the laptop. Further increasing the number of Learner nodes results
|
observable, but sub-linear speedups, with the 12 Learner System using 7x less
|
||||||
in observable, but sub-linear speedups, with the 12 Learner System using 7x
|
data per Learner to achieve the target loss of \verb|8.3|. This decrease in
|
||||||
less data per Learner. This decrease in gains can probably be linked to the
|
gains can probably be linked to the deficiencies of the neural network model
|
||||||
deficiencies of the neural network model being used, and thus, to achieve
|
being used, and thus, to achieve further speed-ups, the network architecture
|
||||||
further speed-ups, the network architecture has to be investigated in more
|
and training hyperparameters have to be investigated in more depth. Furthermore,
|
||||||
depth.
|
the loss plots suggest that for longer training the difference between
|
||||||
|
configurations with different numbers of Learners should still be observable,
|
||||||
|
however, due to time and hardware constraints it was not possible to
|
||||||
|
investigate the speed-ups achieved in longer running trials in more detail.
|
||||||
|
|
||||||
Finally, as demonstrated in \autoref{fig:moby} and \autoref{fig:dick}, the systems with
|
Finally, as can be observed in \autoref{fig:datasets} and
|
||||||
individual independent pipelines for each Learner perform and scale worse than
|
\autoref{fig:speedups}, the systems with individual pipelines with independent
|
||||||
the single-pipeline systems. However, the trend for scaling is still visible
|
input data for each Learner initially perform and scale worse than the
|
||||||
and provides evidence that training is possible even when non-IID
|
single-pipeline systems. However, in the later stages of training the effect of
|
||||||
heterogeneous data is available to each individual Learner.
|
using multiple pipelines becomes more positive, e.g.\@ the
|
||||||
|
\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
|
||||||
|
\mbox{12 Learner -- 1 Pipeline}
|
||||||
|
system. Since input pipelines are computationally cheap, and it is
|
||||||
|
computationally viable not to store the data as one big file but rather have it
|
||||||
|
split across multiple nodes, this mode of operation should be investigated
|
||||||
|
further and possibly preferred for large-scale training.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{fig/datasets.pdf}
|
||||||
|
\caption{Validation Loss Against the Amount of Data per Learner}
|
||||||
|
\label{fig:datasets}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{fig/speedups.pdf}
|
||||||
|
\caption{Scalability Results with the English Wikipedia Dataset}
|
||||||
|
\label{fig:speedups}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
\section{Conclusion and Future Works}
|
\section{Conclusion and Future Works}
|
||||||
|
|
||||||
@@ -465,13 +483,12 @@ two parts. The drawbacks of this approach are that the full Python interpreter
|
|||||||
still gets embedded into the C application, and, furthermore, some parts of
|
still gets embedded into the C application, and, furthermore, some parts of
|
||||||
Python, such as the \verb|multiprocessing| module, result in failures when
|
Python, such as the \verb|multiprocessing| module, result in failures when
|
||||||
embedded into a C application, which prohibits the use of some Python libraries
|
embedded into a C application, which prohibits the use of some Python libraries
|
||||||
like \textit{scikit-learn} or \textit{NLTK} that use \verb|multiprocessing|
|
like NLTK that use \verb|multiprocessing| internally.
|
||||||
internally.
|
|
||||||
|
|
||||||
Another major accomplishment is the creation of a modular distributed Deep
|
Another major accomplishment is the creation of a modular distributed Deep
|
||||||
Learning architecture for a basic NLP task, which can be further expanded to
|
Learning architecture for a basic NLP task, which can be further expanded to
|
||||||
solve higher-level problems, like word prediction or sentiment analysis.
|
solve higher-level problems, like word prediction or sentiment analysis.
|
||||||
Furthermore, the results of the tests show that there can be significant
|
Furthermore, the results of the tests show that there can be significant
|
||||||
improvements in terms of training times if the training is performed on
|
improvements in terms of training times if the training is performed on
|
||||||
multiple nodes in parallel, even with independent data on each node.
|
multiple nodes in parallel, even with independent data on each node.
|
||||||
|
|
||||||
@@ -480,7 +497,11 @@ system currently uses CPU for neural network training, which is inefficient.
|
|||||||
Therefore, it might be interesting to investigate whether MPI can be used to
|
Therefore, it might be interesting to investigate whether MPI can be used to
|
||||||
distribute the system across the cluster of GPU-equipped nodes. Furthermore,
|
distribute the system across the cluster of GPU-equipped nodes. Furthermore,
|
||||||
the architecture of the neural network probably requires some fine-tuning to
|
the architecture of the neural network probably requires some fine-tuning to
|
||||||
achieve better scalability, as reported in~\cite{fedavg}. Finally, an
|
achieve better scalability, as reported in~\cite{fedavg}. It would also be
|
||||||
|
interesting to investigate finer-grain parallelism with FG-MPI~\cite{fg-mpi},
|
||||||
|
especially for the input pipeline, since the pipeline nodes are rather too
|
||||||
|
lightweight for each of them to occupy a separate process, and therefore the
|
||||||
|
coroutine-based parallelism might be a better fit in this case. Finally, an
|
||||||
interesting direction would be to split the neural networks across multiple
|
interesting direction would be to split the neural networks across multiple
|
||||||
nodes, with one neural network layer occupying one node (e.g.\@ as
|
nodes, with one neural network layer occupying one node (e.g.\@ as
|
||||||
in~\cite{syngrad}), which might distribute the computational load across the
|
in~\cite{syngrad}), which might distribute the computational load across the
|
||||||
|
|||||||
Reference in New Issue
Block a user