even nicer plots, can you believe it

2019-12-15 21:05:11 -08:00
parent 670f69e0df
commit 24ca380cbf
2 changed files with 66 additions and 44 deletions

View File

@@ -13,12 +13,14 @@ LOGS = os.path.join(HERE, '../../docs/logs/')
 datasets = {
     'moby': {
         'idx': 0,
-        'name': 'Moby Dick (~200k words)',
+        'name': 'Moby Dick',
+        'words': '200k',
         'target': 8.4,
         'lim': (16000, 320000)
     },
     'wiki': {
-        'name': 'English Wikipedia (~90M words)',
+        'name': 'English Wikipedia',
+        'words': '90M',
         'idx': 1,
         'target': 8.3,
         'lim': (16000, 360000)
@@ -51,8 +53,13 @@ if __name__ == '__main__':
     fig = plt.figure(figsize=(10, 4))
     fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
     axs = fig.subplots(1, len(datasets))
-    pp_speedup = []
-    l_speedup = []
+    pp_speedup = {
+        'wiki': [],
+    }
+    l_speedup = {
+        'moby': [],
+        'wiki': [],
+    }
 
     for fn in files:
         name, learners, pipelines = meta_from_fn(fn)
@@ -74,11 +81,10 @@ if __name__ == '__main__':
             f' {pipelines} Pipeline{s(pipelines)}'
         )
         ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
-        if name == 'wiki':
-            if pipelines > 1 or learners == 1:
-                pp_speedup.append((pipelines, ttt))
-            if pipelines == 1:
-                l_speedup.append((learners, ttt))
+        if (pipelines > 1 or learners == 1) and name == 'wiki':
+            pp_speedup[name].append((pipelines, ttt))
+        if pipelines == 1:
+            l_speedup[name].append((learners, ttt))
 
     for d in datasets.values():
         a = axs[d['idx']]
@@ -86,22 +92,38 @@ if __name__ == '__main__':
         a.set_ylabel('Validation Loss')
         a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
         a.set_xlim(*d['lim'])
-        a.set_title(d['name'])
+        a.set_title(f'{d["name"]} (~{d["words"]} words)')
         a.legend()
         a.axhline(d['target'], color='k', linestyle=':')
 
     fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))
 
     def speedup_plot(zipped):
-        factors, time = zip(*sorted(zipped))
-        time = np.asarray(time)
-        speedup = time[0] / time
-        print(factors, time)
-        plt.plot(factors, speedup)
-        plt.xlim(min(factors), max(factors))
-        plt.ylim(min(speedup), max(speedup))
-        plt.xticks([*range(min(factors), max(factors) + 1)])
-        plt.yticks([*range(floor(min(speedup)), ceil(max(speedup)) + 1)])
+        min_f = []
+        max_f = []
+        min_s = []
+        max_s = []
+        for z in sorted(zipped, key=lambda x: datasets[x]['idx']):
+            d = datasets[z]
+            factors, time = zip(*sorted(zipped[z]))
+            time = np.asarray(time)
+            speedup = time[0] / time
+            print(factors, time)
+            plt.plot(
+                factors, speedup,
+                label=f'{d["name"]}, target: {d["target"]}',
+                color=f'C{d["idx"]}'
+            )
+            min_s.append(min(speedup))
+            max_s.append(max(speedup))
+            min_f.append(min(factors))
+            max_f.append(max(factors))
+        plt.xlim(min(min_f), max(max_f))
+        plt.ylim(min(min_s), max(max_s))
+        plt.xticks([*range(min(min_f), max(max_f) + 1)])
+        plt.yticks([*range(floor(min(min_s)), ceil(max(max_s)) + 1)])
+        plt.legend(loc='upper left')
         plt.grid()
 
     fig = plt.figure(figsize=(10, 4))
@@ -111,13 +133,13 @@ if __name__ == '__main__':
     speedup_plot(l_speedup)
     plt.title('Single Pipeline')
     plt.xlabel('Number of Learners')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.subplot(122)
     speedup_plot(pp_speedup)
     plt.title('Multiple Pipelines')
     plt.xlabel('Number of Pipelines')
-    plt.ylabel(f'Speedup to Target {datasets["wiki"]["target"]}')
+    plt.ylabel(f'Speedup to Target')
 
     plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
     plt.show()
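For readers skimming the diff, the gist of the change is that the speedup bookkeeping moves from flat lists to per-dataset dicts, so speedup_plot can draw one labelled, colour-coded curve per dataset. The following is a minimal standalone sketch of that pattern, not the repository's actual script; the trimmed datasets dict and the (factor, windows-to-target) pairs are hypothetical placeholders, not measured values.

    import matplotlib.pyplot as plt
    import numpy as np

    # Trimmed dataset metadata in the spirit of the script above.
    datasets = {
        'moby': {'idx': 0, 'name': 'Moby Dick', 'target': 8.4},
        'wiki': {'idx': 1, 'name': 'English Wikipedia', 'target': 8.3},
    }

    # (scaling factor, context windows needed to reach the target loss),
    # grouped by dataset -- placeholder numbers for illustration only.
    l_speedup = {
        'moby': [(1, 300000), (2, 210000), (4, 180000)],
        'wiki': [(1, 320000), (4, 110000), (12, 46000)],
    }

    def speedup_plot(grouped):
        # One curve per dataset: speedup relative to its own smallest-factor run.
        for key in sorted(grouped, key=lambda k: datasets[k]['idx']):
            d = datasets[key]
            factors, windows = zip(*sorted(grouped[key]))
            speedup = np.asarray(windows)[0] / np.asarray(windows)
            plt.plot(factors, speedup,
                     label=f'{d["name"]}, target: {d["target"]}',
                     color=f'C{d["idx"]}')
        plt.legend(loc='upper left')
        plt.grid()

    speedup_plot(l_speedup)
    plt.xlabel('Number of Learners')
    plt.ylabel('Speedup to Target')
    plt.show()

Dividing by the first (smallest-factor) entry makes that run the baseline, so each curve starts at a speedup of 1 by construction.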

View File

@@ -376,15 +376,15 @@ total number of training windows consumed by the system is the number of
 windows for each Learner times the number of Learners, the Learners process
 their windows in parallel, thus the longest computation path is as long as the
 number of windows that each Learner processes, which is a reasonable
-approximation for parallel performance. Moreover, the tests have shown that the
-training steps dominate the running time (the pipeline with a single Learner
-could process around 45 batches/s, but over 500 batches/s when the call to the
-training function was commented out), therefore the number of context windows
-processed by Learners is the most important parameter for the overall
-performance. It is also possible to count the processed batches and not the
-context windows, however it may be interesting to compare the influence of the
-number of the context windows in a batch (i.e.\@ the \textit{batch size}) on
-the training performance, such that e.g.\@ increasing the batch size might
+approximation for parallel performance. Moreover, the tests have shown that
+Learners dominate the running time (the pipeline with a single Learner could
+process around 45 batches/s, but over 500 batches/s when the call to the
+training function in the Learner was commented out), therefore the number of
+context windows processed by Learners is the most important parameter for the
+overall performance. It is also possible to count the processed batches and not
+the context windows, however it may be interesting to compare the influence of
+the number of the context windows in a batch (i.e.\@ the \textit{batch size})
+on the training performance, such that e.g.\@ increasing the batch size might
 actually reduce the amount of data needed for training.
 
 The wall time was only used as a secondary measure, since due to time
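The speedup figures discussed below follow directly from this choice of metric: a configuration's speedup is the ratio between the context windows the single-Learner baseline needs to reach the target loss and the windows each Learner needs in the scaled-out run. A small illustrative sketch, using hypothetical window counts rather than measurements from these experiments:

    # Illustrative only: the window counts below are hypothetical, not values
    # measured in these experiments.
    def speedup_to_target(baseline_windows, scaled_windows):
        """Windows the 1-Learner run needs / windows each Learner needs."""
        return baseline_windows / scaled_windows

    # If the single Learner needs 280k context windows to reach the target loss
    # and each of 12 Learners needs 40k, the reported speedup is 7x.
    print(speedup_to_target(280_000, 40_000))  # -> 7.0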
@@ -405,7 +405,7 @@ Another dataset was a part of a recent English Wikipedia dump~\cite{wikidump}
 (approx.\@ 90M words), which was transformed into plain text using the
 WikiExtractor~\cite{wikiextractor} tool. For this dataset the vocabulary is the
 list of 10000 most frequently used English words, obtained
-from~\cite{10k-words}, again, excluding the stop words. As a test data, 5000
+from~\cite{10k-words}, also excluding the stop words. As a test data, 5000
 context windows were randomly sampled from the dump file.
 
 The test configurations were:
@@ -419,10 +419,11 @@ The test configurations were:
 For the smaller of the two datasets the target was set to \verb|8.4|, and it
 can be observed in \autoref{fig:datasets}, that modest speedups can be achieved
-when going from 1 Learner to 2 or 4 Learners; employing 8 Learners or more,
-however, doesn't result in any further improvement, with the system maxing out
-on 1.6x speed up. A possible explanation for this is that the ``Moby Dick''
-book is too small for multiple Learners to have sufficient data to train on.
+by employing up to 8 Learners, with the system maxing out on 2.4x speed-up.
+Furthermore, a \mbox{2 Learner -- 2 Pipeline} configuration training
+independently on two different halves of the book never even reaches the
+target. A possible explanation for this is that the ``Moby Dick'' book is too
+small for multiple Learners to have sufficient data to train on.
 
 For the larger dataset with the target set to \verb|8.3|, however, the results
 were more promising, as can be seen in \autoref{fig:datasets} and
@@ -439,19 +440,18 @@ observable, but sub-linear speedups, with the 12 Learner System using 7x less
 data per Learner to achieve the target loss of \verb|8.3|. This decrease in
 gains can probably be linked to the deficiencies of the neural network model
 being used, and thus, to achieve further speed-ups, the network architecture
-and training hyperparameters has to be investigated in more depth. Furthermore,
-the loss plots suggest that for longer training the difference between
-configurations with different number of Learners should still be observable,
-however, due to time and hardware constraints it was not possible to
-investigate the speed-ups achieved in longer running trials in more detail.
+and training hyperparameters have to be investigated in more depth.
+Furthermore, the loss plots suggest that for longer training the difference
+between configurations with different number of Learners should still be
+observable, however, due to time and hardware constraints it was not possible
+to investigate the speed-ups achieved in longer running trials in more detail.
 
 Finally, as can be observed in \autoref{fig:datasets} and
 \autoref{fig:speedups}, the systems with individual pipelines with independent
 input data for each Learner initially perform and scale worse than the
 single-pipeline systems. However, in the later stages of training the effect of
-using multiple pipelines becomes more positive, e.g.\@ the
-\mbox{4 Learner -- 4 Pipeline} system almost catches up with the
-\mbox{12 Learner -- 1 Pipeline}
+using multiple pipelines becomes more positive, e.g.\@ the \mbox{4 Learner -- 4
+Pipeline} system almost catches up with the \mbox{12 Learner -- 1 Pipeline}
 system. Since input pipelines are computationally cheap, and it is
 computationally viable not to store the data as one big file but rather have it
 split across multiple nodes, this mode of operation should be investigated
@@ -467,7 +467,7 @@ further and possibly preferred for large-scale training.
 \begin{figure}
 \centering
 \includegraphics[width=\linewidth]{fig/speedups.pdf}
-\caption{Scalability Results with the English Wikipedia Dataset}
+\caption{Scalability}
 \label{fig:speedups}
 \end{figure}