even nicer plots can you believe it

2019-12-15 21:05:11 -08:00
parent 670f69e0df
commit 24ca380cbf
2 changed files with 66 additions and 44 deletions


@@ -13,12 +13,14 @@ LOGS = os.path.join(HERE, '../../docs/logs/')
datasets = {
'moby': {
'idx': 0,
'name': 'Moby Dick (~200k words)',
'name': 'Moby Dick',
'words': '200k',
'target': 8.4,
'lim': (16000, 320000)
},
'wiki': {
'name': 'English Wikipedia (~90M words)',
'name': 'English Wikipedia',
'words': '90M',
'idx': 1,
'target': 8.3,
'lim': (16000, 360000)
@@ -51,8 +53,13 @@ if __name__ == '__main__':
fig = plt.figure(figsize=(10, 4))
fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
axs = fig.subplots(1, len(datasets))
pp_speedup = []
l_speedup = []
pp_speedup = {
'wiki': [],
}
l_speedup = {
'moby': [],
'wiki': [],
}
for fn in files:
name, learners, pipelines = meta_from_fn(fn)
@@ -74,11 +81,10 @@ if __name__ == '__main__':
f' {pipelines} Pipeline{s(pipelines)}'
)
ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
if name == 'wiki':
if pipelines > 1 or learners == 1:
pp_speedup.append((pipelines, ttt))
if pipelines == 1:
l_speedup.append((learners, ttt))
if (pipelines > 1 or learners == 1) and name == 'wiki':
pp_speedup[name].append((pipelines, ttt))
if pipelines == 1:
l_speedup[name].append((learners, ttt))
for d in datasets.values():
a = axs[d['idx']]
@@ -86,22 +92,38 @@ if __name__ == '__main__':
a.set_ylabel('Validation Loss')
a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
a.set_xlim(*d['lim'])
a.set_title(d['name'])
a.set_title(f'{d["name"]} (~{d["words"]} words)')
a.legend()
a.axhline(d['target'], color='k', linestyle=':')
fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))
def speedup_plot(zipped):
factors, time = zip(*sorted(zipped))
time = np.asarray(time)
speedup = time[0] / time
print(factors, time)
plt.plot(factors, speedup)
plt.xlim(min(factors), max(factors))
plt.ylim(min(speedup), max(speedup))
plt.xticks([*range(min(factors), max(factors) + 1)])
plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
min_f = []
max_f = []
min_s = []
max_s = []
for z in sorted(zipped, key=lambda x: datasets[x]['idx']):
d = datasets[z]
factors, time = zip(*sorted(zipped[z]))
time = np.asarray(time)
speedup = time[0] / time
print(factors, time)
plt.plot(
factors, speedup,
label=f'{d["name"]}, target: {d["target"]}',
color=f'C{d["idx"]}'
)
min_s.append(min(speedup))
max_s.append(max(speedup))
min_f.append(min(factors))
max_f.append(max(factors))
plt.xlim(min(min_f), max(max_f))
plt.ylim(min(min_s), max(max_s))
plt.xticks([*range(min(min_f), max(max_f) + 1)])
plt.yticks([*range(floor(min(min_s)), ceil(max(max_s)) + 1)])
plt.legend(loc='upper left')
plt.grid()
fig = plt.figure(figsize=(10, 4))
@@ -111,13 +133,13 @@ if __name__ == '__main__':
speedup_plot(l_speedup)
plt.title('Single Pipeline')
plt.xlabel('Number of Learners')
plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
plt.ylabel('Speedup to Target')
plt.subplot(122)
speedup_plot(pp_speedup)
plt.title('Multiple Pipelines')
plt.xlabel('Number of Pipelines')
plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
plt.ylabel('Speedup to Target')
plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
plt.show()
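
For reference, the core of the refactored plot is the per-dataset normalisation of time-to-target against the factor-1 (single Learner / single Pipeline) run. The following is a minimal standalone sketch of that computation; the (factor, windows-to-target) pairs below are made up purely for illustration, whereas the real script collects them from the log files via meta_from_fn and idx_of.

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical (parallelism factor, windows-to-target) pairs, shaped like the
# l_speedup dict built above; the numbers are placeholders, not measurements.
l_speedup = {
    'moby': [(1, 300_000), (2, 220_000), (4, 190_000), (8, 185_000)],
    'wiki': [(1, 340_000), (2, 200_000), (4, 120_000), (12, 48_000)],
}

def relative_speedup(pairs):
    """Sort by factor and normalise against the factor-1 baseline."""
    factors, windows = zip(*sorted(pairs))
    windows = np.asarray(windows, dtype=float)
    return factors, windows[0] / windows   # same idea as speedup_plot above

for name, pairs in l_speedup.items():
    factors, speedup = relative_speedup(pairs)
    plt.plot(factors, speedup, marker='o', label=name)

plt.xlabel('Number of Learners')
plt.ylabel('Speedup to Target')
plt.legend()
plt.grid()
plt.show()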


@@ -376,15 +376,15 @@ total number of training windows consumed by the system is the number of
windows for each Learner times the number of Learners, the Learners process
their windows in parallel, and thus the longest computation path is as long as the
number of windows that each Learner processes, which is a reasonable
approximation for parallel performance. Moreover, the tests have shown that the
training steps dominate the running time (the pipeline with a single Learner
could process around 45 batches/s, but over 500 batches/s when the call to the
training function was commented out), therefore the number of context windows
processed by Learners is the most important parameter for the overall
performance. It is also possible to count the processed batches and not the
context windows, however it may be interesting to compare the influence of the
number of the context windows in a batch (i.e.\@ the \textit{batch size}) on
the training performance, such that e.g.\@ increasing the batch size might
approximation for parallel performance. Moreover, the tests have shown that
Learners dominate the running time (the pipeline with a single Learner could
process around 45 batches/s, but over 500 batches/s when the call to the
training function in the Learner was commented out); therefore, the number of
context windows processed by Learners is the most important parameter for the
overall performance. It is also possible to count processed batches instead of
context windows; however, it may be interesting to compare the influence of
the number of context windows in a batch (i.e.\@ the \textit{batch size}) on
the training performance, since e.g.\@ increasing the batch size might
actually reduce the amount of data needed for training.
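
As a quick sanity check of the throughput figures quoted above, the share of per-batch time spent in the training call can be estimated from the two rates. The sketch below uses only the numbers mentioned in the text (45 vs.\@ 500 batches/s) and is purely illustrative.

# Illustrative arithmetic only: estimate the fraction of per-batch time spent
# in the training call from the two quoted throughputs.
full_rate = 45        # batches/s with training enabled
stripped_rate = 500   # batches/s with the training call commented out
t_full = 1.0 / full_rate       # seconds per batch, training included
t_rest = 1.0 / stripped_rate   # seconds per batch, training excluded
train_fraction = (t_full - t_rest) / t_full
print(f'training accounts for roughly {train_fraction:.0%} of each batch')  # ~91%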
The wall time was only used as a secondary measure, since due to time
@@ -405,7 +405,7 @@ Another dataset was a part of a recent English Wikipedia dump~\cite{wikidump}
(approx.\@ 90M words), which was transformed into plain text using the
WikiExtractor~\cite{wikiextractor} tool. For this dataset the vocabulary is the
list of 10000 most frequently used English words, obtained
from~\cite{10k-words}, again, excluding the stop words. As a test data, 5000
from~\cite{10k-words}, also excluding the stop words. As test data, 5000
context windows were randomly sampled from the dump file.
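
A minimal sketch of the preparation step described here: building the vocabulary from the 10000 most frequent English words minus stop words, then sampling 5000 test context windows from the extracted plain text. The file names, stop-word list, and window size are placeholders, not the project's actual values.

import random

# Placeholder paths; the real word list and stop-word list live in the project.
with open('10k-most-frequent-words.txt') as f:
    frequent = [w.strip().lower() for w in f if w.strip()]
with open('stopwords.txt') as f:
    stop_words = {w.strip().lower() for w in f if w.strip()}

# Vocabulary: most frequent words with stop words removed.
vocab = [w for w in frequent if w not in stop_words]

# Sample 5000 test context windows from the WikiExtractor plain-text output.
window_size = 11  # placeholder width for a context window
with open('wiki-plaintext.txt') as f:
    words = f.read().split()
starts = random.sample(range(len(words) - window_size), k=5000)
test_windows = [words[i:i + window_size] for i in starts]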
The test configurations were:
@@ -419,10 +419,11 @@ The test configurations were:
For the smaller of the two datasets the target was set to \verb|8.4|, and it
can be observed in \autoref{fig:datasets} that modest speedups can be achieved
when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
however, doesn't result in any further improvement, with the system maxing out
on 1.6x speed up. A possible explanation for this is that the ``Moby Dick''
book is too small for multiple Learners to have sufficient data to train on.
by employing up to 8 Learners, with the system maxing out at 2.4x speed-up.
Furthermore, a \mbox{2 Learner -- 2 Pipeline} configuration training
independently on two different halves of the book never even reaches the
target. A possible explanation for this is that the ``Moby Dick'' book is too
small for multiple Learners to have sufficient data to train on.
For the larger dataset with the target set to \verb|8.3|, however, the results
were more promising, as can be seen in \autoref{fig:datasets} and
@@ -439,19 +440,18 @@ observable, but sub-linear speedups, with the 12 Learner System using 7x less
data per Learner to achieve the target loss of \verb|8.3|. This decrease in
gains can probably be linked to the deficiencies of the neural network model
being used, and thus, to achieve further speed-ups, the network architecture
and training hyperparameters has to be investigated in more depth. Furthermore,
the loss plots suggest that for longer training the difference between
configurations with different number of Learners should still be observable,
however, due to time and hardware constraints it was not possible to
investigate the speed-ups achieved in longer running trials in more detail.
and training hyperparameters have to be investigated in more depth.
Furthermore, the loss plots suggest that for longer training the difference
between configurations with different numbers of Learners should still be
observable; however, due to time and hardware constraints it was not possible
to investigate the speed-ups achieved in longer-running trials in more detail.
Finally, as can be observed in \autoref{fig:datasets} and
\autoref{fig:speedups}, the systems with individual pipelines with independent
input data for each Learner initially perform and scale worse than the
single-pipeline systems. However, in the later stages of training the effect of
using multiple pipelines becomes more positive, e.g.\@ the
\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
\mbox{12 Learner -- 1 Pipeline}
using multiple pipelines becomes more positive, e.g.\@ the \mbox{4 Learner -- 4
Pipeline} system almost catches up with the \mbox{12 Learner -- 1 Pipeline}
system. Since input pipelines are computationally cheap, and it is
computationally viable not to store the data as one big file but rather have it
split across multiple nodes, this mode of operation should be investigated
@@ -467,7 +467,7 @@ further and possibly preferred for large-scale training.
\begin{figure}
\centering
\includegraphics[width=\linewidth]{fig/speedups.pdf}
\caption{Scalability Results with the English Wikipedia Dataset}
\caption{Scalability}
\label{fig:speedups}
\end{figure}