even nicer plots, can you believe it

2019-12-15 21:05:11 -08:00
parent 670f69e0df
commit 24ca380cbf
2 changed files with 66 additions and 44 deletions

View File

@@ -13,12 +13,14 @@ LOGS = os.path.join(HERE, '../../docs/logs/')
 datasets = {
     'moby': {
         'idx': 0,
-        'name': 'Moby Dick (~200k words)',
+        'name': 'Moby Dick',
+        'words': '200k',
         'target': 8.4,
         'lim': (16000, 320000)
     },
     'wiki': {
-        'name': 'English Wikipedia (~90M words)',
+        'name': 'English Wikipedia',
+        'words': '90M',
         'idx': 1,
         'target': 8.3,
         'lim': (16000, 360000)
@@ -51,8 +53,13 @@ if __name__ == '__main__':
     fig = plt.figure(figsize=(10, 4))
     fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
     axs = fig.subplots(1, len(datasets))
-    pp_speedup = []
-    l_speedup = []
+    pp_speedup = {
+        'wiki': [],
+    }
+    l_speedup = {
+        'moby': [],
+        'wiki': [],
+    }
 
     for fn in files:
         name, learners, pipelines = meta_from_fn(fn)
@@ -74,11 +81,10 @@ if __name__ == '__main__':
             f' {pipelines} Pipeline{s(pipelines)}'
         )
         ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
-        if name == 'wiki':
-            if pipelines > 1 or learners == 1:
-                pp_speedup.append((pipelines, ttt))
-            if pipelines == 1:
-                l_speedup.append((learners, ttt))
+        if (pipelines > 1 or learners == 1) and name == 'wiki':
+            pp_speedup[name].append((pipelines, ttt))
+        if pipelines == 1:
+            l_speedup[name].append((learners, ttt))
 
     for d in datasets.values():
         a = axs[d['idx']]
@@ -86,22 +92,38 @@ if __name__ == '__main__':
         a.set_ylabel('Validation Loss')
         a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
         a.set_xlim(*d['lim'])
-        a.set_title(d['name'])
+        a.set_title(f'{d["name"]} (~{d["words"]} words)')
         a.legend()
         a.axhline(d['target'], color='k', linestyle=':')
 
     fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))
 
     def speedup_plot(zipped):
-        factors, time = zip(*sorted(zipped))
-        time = np.asarray(time)
-        speedup = time[0] / time
-        print(factors, time)
-        plt.plot(factors, speedup)
-        plt.xlim(min(factors), max(factors))
-        plt.ylim(min(speedup), max(speedup))
-        plt.xticks([*range(min(factors), max(factors) + 1)])
-        plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
+        min_f = []
+        max_f = []
+        min_s = []
+        max_s = []
+        for z in sorted(zipped, key=lambda x: datasets[x]['idx']):
+            d = datasets[z]
+            factors, time = zip(*sorted(zipped[z]))
+            time = np.asarray(time)
+            speedup = time[0] / time
+            print(factors, time)
+            plt.plot(
+                factors, speedup,
+                label=f'{d["name"]}, target: {d["target"]}',
+                color=f'C{d["idx"]}'
+            )
+            min_s.append(min(speedup))
+            max_s.append(max(speedup))
+            min_f.append(min(factors))
+            max_f.append(max(factors))
+        plt.xlim(min(min_f), max(max_f))
+        plt.ylim(min(min_s), max(max_s))
+        plt.xticks([*range(min(min_f), max(max_f) + 1)])
+        plt.yticks([*range(floor(min(min_s)), ceil(max(max_s)) + 1)])
+        plt.legend(loc='upper left')
         plt.grid()
 
     fig = plt.figure(figsize=(10, 4))
@@ -111,13 +133,13 @@ if __name__ == '__main__':
     speedup_plot(l_speedup)
     plt.title('Single Pipeline')
     plt.xlabel('Number of Learners')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.subplot(122)
     speedup_plot(pp_speedup)
     plt.title('Multiple Pipelines')
     plt.xlabel('Number of Pipelines')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
     plt.show()
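For readers skimming the diff, the gist of the change is that the speedup bookkeeping moves from flat lists to per-dataset dicts, so speedup_plot can draw one labelled, colour-coded curve per dataset. The following is a minimal standalone sketch of that pattern, not the repository's actual script; the trimmed datasets dict and the (factor, windows-to-target) pairs are hypothetical placeholders, not measured values.

    import matplotlib.pyplot as plt
    import numpy as np

    # Trimmed dataset metadata in the spirit of the script above.
    datasets = {
        'moby': {'idx': 0, 'name': 'Moby Dick', 'target': 8.4},
        'wiki': {'idx': 1, 'name': 'English Wikipedia', 'target': 8.3},
    }

    # (scaling factor, context windows needed to reach the target loss),
    # grouped by dataset -- placeholder numbers for illustration only.
    l_speedup = {
        'moby': [(1, 300000), (2, 210000), (4, 180000)],
        'wiki': [(1, 320000), (4, 110000), (12, 46000)],
    }

    def speedup_plot(grouped):
        # One curve per dataset: speedup relative to its own smallest-factor run.
        for key in sorted(grouped, key=lambda k: datasets[k]['idx']):
            d = datasets[key]
            factors, windows = zip(*sorted(grouped[key]))
            speedup = np.asarray(windows)[0] / np.asarray(windows)
            plt.plot(factors, speedup,
                     label=f'{d["name"]}, target: {d["target"]}',
                     color=f'C{d["idx"]}')
        plt.legend(loc='upper left')
        plt.grid()

    speedup_plot(l_speedup)
    plt.xlabel('Number of Learners')
    plt.ylabel('Speedup to Target')
    plt.show()

Dividing by the first (smallest-factor) entry makes that run the baseline, so each curve starts at a speedup of 1 by construction.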

View File

@@ -376,15 +376,15 @@ total number of training windows consumed by the system is the number of
 windows for each Learner times the number of Learners, the Learners process
 their windows in parallel, thus the longest computation path is as long as the
 number of windows that each Learner processes, which is a reasonable
-approximation for parallel performance. Moreover, the tests have shown that the
-training steps dominate the running time (the pipeline with a single Learner
-could process around 45 batches/s, but over 500 batches/s when the call to the
-training function was commented out), therefore the number of context windows
-processed by Learners is the most important parameter for the overall
-performance. It is also possible to count the processed batches and not the
-context windows, however it may be interesting to compare the influence of the
-number of the context windows in a batch (i.e.\@ the \textit{batch size}) on
-the training performance, such that e.g.\@ increasing the batch size might
+approximation for parallel performance. Moreover, the tests have shown that
+Learners dominate the running time (the pipeline with a single Learner could
+process around 45 batches/s, but over 500 batches/s when the call to the
+training function in the Learner was commented out), therefore the number of
+context windows processed by Learners is the most important parameter for the
+overall performance. It is also possible to count the processed batches and not
+the context windows, however it may be interesting to compare the influence of
+the number of the context windows in a batch (i.e.\@ the \textit{batch size})
+on the training performance, such that e.g.\@ increasing the batch size might
 actually reduce the amount of data needed for training.
 
 The wall time was only used as a secondary measure, since due to time
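The speedup figures discussed below follow directly from this choice of metric: a configuration's speedup is the ratio between the context windows the single-Learner baseline needs to reach the target loss and the windows each Learner needs in the scaled-out run. A small illustrative sketch, using hypothetical window counts rather than measurements from these experiments:

    # Illustrative only: the window counts below are hypothetical, not values
    # measured in these experiments.
    def speedup_to_target(baseline_windows, scaled_windows):
        """Windows the 1-Learner run needs / windows each Learner needs."""
        return baseline_windows / scaled_windows

    # If the single Learner needs 280k context windows to reach the target loss
    # and each of 12 Learners needs 40k, the reported speedup is 7x.
    print(speedup_to_target(280_000, 40_000))  # -> 7.0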
@@ -405,7 +405,7 @@ Another dataset was a part of a recent English Wikipedia dump~\cite{wikidump}
 (approx.\@ 90M words), which was transformed into plain text using the
 WikiExtractor~\cite{wikiextractor} tool. For this dataset the vocabulary is the
 list of 10000 most frequently used English words, obtained
-from~\cite{10k-words}, again, excluding the stop words. As a test data, 5000
+from~\cite{10k-words}, also excluding the stop words. As a test data, 5000
 context windows were randomly sampled from the dump file.
 
 The test configurations were:
@@ -419,10 +419,11 @@ The test configurations were:
 For the smaller of the two datasets the target was set to \verb|8.4|, and it
 can be observed in \autoref{fig:datasets}, that modest speedups can be achieved
-when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
-however, doesn't result in any further improvement, with the system maxing out
-on 1.6x speed up. A possible explanation for this is that the ``Moby Dick''
-book is too small for multiple Learners to have sufficient data to train on.
+by employing up to 8 Learners, with the system maxing out on 2.4x speed-up.
+Furthermore, a \mbox{2 Learner -- 2 Pipeline} configuration training
+independently on two different halves of the book never even reaches the
+target. A possible explanation for this is that the ``Moby Dick'' book is too
+small for multiple Learners to have sufficient data to train on.
 
 For the larger dataset with the target set to \verb|8.3|, however, the results
 were more promising, as can be seen in \autoref{fig:datasets} and
@@ -439,19 +440,18 @@ observable, but sub-linear speedups, with the 12 Learner System using 7x less
 data per Learner to achieve the target loss of \verb|8.3|. This decrease in
 gains can probably be linked to the deficiencies of the neural network model
 being used, and thus, to achieve further speed-ups, the network architecture
-and training hyperparameters has to be investigated in more depth. Furthermore,
-the loss plots suggest that for longer training the difference between
-configurations with different number of Learners should still be observable,
-however, due to time and hardware constraints it was not possible to
-investigate the speed-ups achieved in longer running trials in more detail.
+and training hyperparameters have to be investigated in more depth.
+Furthermore, the loss plots suggest that for longer training the difference
+between configurations with different number of Learners should still be
+observable, however, due to time and hardware constraints it was not possible
+to investigate the speed-ups achieved in longer running trials in more detail.
 
 Finally, as can be observed in \autoref{fig:datasets} and
 \autoref{fig:speedups}, the systems with individual pipelines with independent
 input data for each Learner initially perform and scale worse than the
 single-pipeline systems. However, in the later stages of training the effect of
-using multiple pipelines becomes more positive, e.g.\@ the
-\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
-\mbox{12 Learner -- 1 Pipeline}
+using multiple pipelines becomes more positive, e.g.\@ the \mbox{4 Learner -- 4
+Pipeline} system almost catches up with the \mbox{12 Learner -- 1 Pipeline}
 system. Since input pipelines are computationally cheap, and it is
 computationally viable not to store the data as one big file but rather have it
 split across multiple nodes, this mode of operation should be investigated
@@ -467,7 +467,7 @@ further and possibly preferred for large-scale training.
 \begin{figure}
 \centering
 \includegraphics[width=\linewidth]{fig/speedups.pdf}
-\caption{Scalability Results with the English Wikipedia Dataset}
+\caption{Scalability}
 \label{fig:speedups}
 \end{figure}