even nicer plots, can you believe it
@@ -13,12 +13,14 @@ LOGS = os.path.join(HERE, '../../docs/logs/')
 datasets = {
     'moby': {
         'idx': 0,
-        'name': 'Moby Dick (~200k words)',
+        'name': 'Moby Dick',
+        'words': '200k',
         'target': 8.4,
         'lim': (16000, 320000)
     },
     'wiki': {
-        'name': 'English Wikipedia (~90M words)',
+        'name': 'English Wikipedia',
+        'words': '90M',
         'idx': 1,
         'target': 8.3,
         'lim': (16000, 360000)
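The word count moves out of the display name into its own field here; a later hunk in this commit rebuilds the full title from the two fields. A minimal illustration using only values already present in this hunk:

d = {'name': 'Moby Dick', 'words': '200k'}
title = f'{d["name"]} (~{d["words"]} words)'  # -> "Moby Dick (~200k words)"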
@@ -51,8 +53,13 @@ if __name__ == '__main__':
     fig = plt.figure(figsize=(10, 4))
     fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
     axs = fig.subplots(1, len(datasets))
-    pp_speedup = []
-    l_speedup = []
+    pp_speedup = {
+        'wiki': [],
+    }
+    l_speedup = {
+        'moby': [],
+        'wiki': [],
+    }
 
     for fn in files:
         name, learners, pipelines = meta_from_fn(fn)
@@ -74,11 +81,10 @@ if __name__ == '__main__':
             f' {pipelines} Pipeline{s(pipelines)}'
         )
         ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
-        if name == 'wiki':
-            if pipelines > 1 or learners == 1:
-                pp_speedup.append((pipelines, ttt))
-            if pipelines == 1:
-                l_speedup.append((learners, ttt))
+        if (pipelines > 1 or learners == 1) and name == 'wiki':
+            pp_speedup[name].append((pipelines, ttt))
+        if pipelines == 1:
+            l_speedup[name].append((learners, ttt))
 
     for d in datasets.values():
         a = axs[d['idx']]
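A quick illustrative check of which runs the rewritten filter admits; the configuration tuples below are hypothetical stand-ins (the real ones come from log file names via meta_from_fn):

configs = [('wiki', 1, 1), ('wiki', 12, 1), ('wiki', 4, 4), ('moby', 2, 2)]  # hypothetical
for name, learners, pipelines in configs:
    to_pp = (pipelines > 1 or learners == 1) and name == 'wiki'
    to_l = pipelines == 1
    # a run contributes to the pipeline-scaling plot only for 'wiki',
    # and to the learner-scaling plot only if it used a single pipeline
    print(f'{name} {learners}L-{pipelines}P -> pp: {to_pp}, l: {to_l}')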
@@ -86,22 +92,38 @@ if __name__ == '__main__':
         a.set_ylabel('Validation Loss')
         a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
         a.set_xlim(*d['lim'])
-        a.set_title(d['name'])
+        a.set_title(f'{d["name"]} (~{d["words"]} words)')
         a.legend()
         a.axhline(d['target'], color='k', linestyle=':')
 
     fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))
 
     def speedup_plot(zipped):
-        factors, time = zip(*sorted(zipped))
-        time = np.asarray(time)
-        speedup = time[0] / time
-        print(factors, time)
-        plt.plot(factors, speedup)
-        plt.xlim(min(factors), max(factors))
-        plt.ylim(min(speedup), max(speedup))
-        plt.xticks([*range(min(factors), max(factors) + 1)])
-        plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
+        min_f = []
+        max_f = []
+        min_s = []
+        max_s = []
+        for z in sorted(zipped, key=lambda x: datasets[x]['idx']):
+            d = datasets[z]
+            factors, time = zip(*sorted(zipped[z]))
+            time = np.asarray(time)
+            speedup = time[0] / time
+            print(factors, time)
+            plt.plot(
+                factors, speedup,
+                label=f'{d["name"]}, target: {d["target"]}',
+                color=f'C{d["idx"]}'
+            )
+            min_s.append(min(speedup))
+            max_s.append(max(speedup))
+            min_f.append(min(factors))
+            max_f.append(max(factors))
+
+        plt.xlim(min(min_f), max(max_f))
+        plt.ylim(min(min_s), max(max_s))
+        plt.xticks([*range(min(min_f), max(max_f) + 1)])
+        plt.yticks([*range(floor(min(min_s)), ceil(max(max_s)) + 1)])
+        plt.legend(loc='upper left')
         plt.grid()
 
     fig = plt.figure(figsize=(10, 4))
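For reference, a standalone sketch (not part of the commit) of the speedup measure the rewritten speedup_plot computes: data consumed to reach the target, relative to the lowest-factor run, which sorted() places first.

import numpy as np

def relative_speedup(pairs):
    # pairs are (parallelism factor, windows-to-target), e.g. l_speedup['wiki']
    factors, windows = zip(*sorted(pairs))
    windows = np.asarray(windows, dtype=float)
    # baseline is the first (smallest-factor) entry, as in the hunk above
    return dict(zip(factors, windows[0] / windows))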
@@ -111,13 +133,13 @@ if __name__ == '__main__':
     speedup_plot(l_speedup)
     plt.title('Single Pipeline')
     plt.xlabel('Number of Learners')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.subplot(122)
     speedup_plot(pp_speedup)
     plt.title('Multiple Pipelines')
     plt.xlabel('Number of Pipelines')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
     plt.show()
@@ -376,15 +376,15 @@ total number of training windows consumed by the system is the number of
 windows for each Learner times the number of Learners, the Learners process
 their windows in parallel, thus the longest computation path is as long as the
 number of windows that each Learner processes, which is a reasonable
-approximation for parallel performance. Moreover, the tests have shown that the
-training steps dominate the running time (the pipeline with a single Learner
-could process around 45 batches/s, but over 500 batches/s when the call to the
-training function was commented out), therefore the number of context windows
-processed by Learners is the most important parameter for the overall
-performance. It is also possible to count the processed batches and not the
-context windows, however it may be interesting to compare the influence of the
-number of the context windows in a batch (i.e.\@ the \textit{batch size}) on
-the training performance, such that e.g.\@ increasing the batch size might
+approximation for parallel performance. Moreover, the tests have shown that
+Learners dominate the running time (the pipeline with a single Learner could
+process around 45 batches/s, but over 500 batches/s when the call to the
+training function in the Learner was commented out), therefore the number of
+context windows processed by Learners is the most important parameter for the
+overall performance. It is also possible to count the processed batches and not
+the context windows, however it may be interesting to compare the influence of
+the number of the context windows in a batch (i.e.\@ the \textit{batch size})
+on the training performance, such that e.g.\@ increasing the batch size might
 actually reduce the amount of data needed for training.
 
 The wall time was only used as a secondary measure, since due to time
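The cost model described in that paragraph can be summarised in a short sketch (assumptions spelled out in the comments; this is not project code):

def critical_path_windows(total_windows, learners):
    # Learners work in parallel, so the longest computation path is roughly
    # the per-Learner share of all consumed context windows
    return total_windows / learners

def windows_from_batches(batches, batch_size):
    # counting batches instead of windows differs only by the batch size factor
    return batches * batch_size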
@@ -405,7 +405,7 @@ Another dataset was a part of a recent English Wikipedia dump~\cite{wikidump}
 (approx.\@ 90M words), which was transformed into plain text using the
 WikiExtractor~\cite{wikiextractor} tool. For this dataset the vocabulary is the
 list of 10000 most frequently used English words, obtained
-from~\cite{10k-words}, again, excluding the stop words. As a test data, 5000
+from~\cite{10k-words}, also excluding the stop words. As a test data, 5000
 context windows were randomly sampled from the dump file.
 
 The test configurations were:
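One plausible way to implement the test-set sampling described in that paragraph (a hypothetical helper, not the project's actual preprocessing code):

import random

def sample_context_windows(tokens, window_size, n_samples, seed=0):
    # pick n_samples random starting positions in the tokenised dump and cut
    # out fixed-size context windows
    rng = random.Random(seed)
    starts = rng.sample(range(len(tokens) - window_size + 1), n_samples)
    return [tokens[s:s + window_size] for s in starts]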
@@ -419,10 +419,11 @@ The test configurations were:
 
 For the smaller of the two datasets the target was set to \verb|8.4|, and it
 can be observed in \autoref{fig:datasets}, that modest speedups can be achieved
-when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
-however, doesn't result in any further improvement, with the system maxing out
-on 1.6x speed up. A possible explanation for this is that the ``Moby Dick''
-book is too small for multiple Learners to have sufficient data to train on.
+by employing up to 8 Learners, with the system maxing out on 2.4x speed-up.
+Furthermore, a \mbox{2 Learner -- 2 Pipeline} configuration training
+independently on two different halves of the book never even reaches the
+target. A possible explanation for this is that the ``Moby Dick'' book is too
+small for multiple Learners to have sufficient data to train on.
 
 For the larger dataset with the target set to \verb|8.3|, however, the results
 were more promising, as can be seen in \autoref{fig:datasets} and
@@ -439,19 +440,18 @@ observable, but sub-linear speedups, with the 12 Learner System using 7x less
 data per Learner to achieve the target loss of \verb|8.3|. This decrease in
 gains can probably be linked to the deficiencies of the neural network model
 being used, and thus, to achieve further speed-ups, the network architecture
-and training hyperparameters has to be investigated in more depth. Furthermore,
-the loss plots suggest that for longer training the difference between
-configurations with different number of Learners should still be observable,
-however, due to time and hardware constraints it was not possible to
-investigate the speed-ups achieved in longer running trials in more detail.
+and training hyperparameters have to be investigated in more depth.
+Furthermore, the loss plots suggest that for longer training the difference
+between configurations with different number of Learners should still be
+observable, however, due to time and hardware constraints it was not possible
+to investigate the speed-ups achieved in longer running trials in more detail.
 
 Finally, as can be observed in \autoref{fig:datasets} and
 \autoref{fig:speedups}, the systems with individual pipelines with independent
 input data for each Learner initially perform and scale worse than the
 single-pipeline systems. However, in the later stages of training the effect of
-using multiple pipelines becomes more positive, e.g.\@ the
-\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
-\mbox{12 Learner -- 1 Pipeline}
+using multiple pipelines becomes more positive, e.g.\@ the \mbox{4 Learner -- 4
+Pipeline} system almost catches up with the \mbox{12 Learner -- 1 Pipeline}
 system. Since input pipelines are computationally cheap, and it is
 computationally viable not to store the data as one big file but rather have it
 split across multiple nodes, this mode of operation should be investigated
@@ -467,7 +467,7 @@ further and possibly preferred for large-scale training.
 \begin{figure}
 \centering
 \includegraphics[width=\linewidth]{fig/speedups.pdf}
-\caption{Scalability Results with the English Wikipedia Dataset}
+\caption{Scalability}
 \label{fig:speedups}
 \end{figure}
 