fedavg_mpi/docs/generate_plots.py

import os
import re
from math import floor, ceil

import matplotlib.pyplot as plt
import numpy as np


HERE = os.path.abspath(os.path.dirname(__file__))
LOGS = os.path.join(HERE, '../../docs/logs/')


datasets = {
    'moby': {
        'idx': 0,
        'name': 'Moby Dick',
        'words': '200k',
        'target': 8.4,
        'lim': (16000, 320000)
    },
    'wiki': {
        'name': 'English Wikipedia',
        'words': '90M',
        'idx': 1,
        'target': 8.3,
        'lim': (16000, 360000)
    }
}


def s(n):
    return 's' if n > 1 else ''


def idx_of(l, cond=lambda x: x):
    try:
        return next(i for i, e in enumerate(l) if cond(e))
    except StopIteration:
        return -1


def meta_from_fn(fn):
    m = re.search(r'(.+)_(\d+)_learner_(\d+)_pp', fn)
    return (lambda x: (x[0], int(x[1]), int(x[2])))(
        m.group(1,2,3)
    )


if __name__ == '__main__':
    files = sorted(os.listdir(LOGS), key= lambda x: meta_from_fn(x)[1])

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)
    axs = fig.subplots(1, len(datasets))
    pp_speedup = {
        'wiki': [],
    }
    l_speedup = {
        'moby': [],
        'wiki': [],
    }

    for fn in files:
        name, learners, pipelines = meta_from_fn(fn)
        if learners == 16:
            continue
        with open(os.path.join(LOGS, fn)) as f:
            lines = f.readlines()
        matches = [re.search(r'windows (\d+) validation loss (\d+\.\d+)', l)
                   for l in lines]
        matches = [m for m in matches if m is not None]
        win_loss = [
            (lambda x: (int(x[0]), float(x[1])))(m.group(1, 2)) for m in matches
        ]
        windows, loss = zip(*win_loss)
        axs[datasets[name]['idx']].plot(
            windows[1:], loss[1:], linestyle='-' * (1 + (pipelines>1)),
            color=f'C{learners // 2}',
            label=f'{learners} Learner{s(learners)},'
            f' {pipelines} Pipeline{s(pipelines)}'
        )
        ttt = windows[idx_of(loss, lambda l: l < datasets[name]['target'])]
        if (pipelines > 1 or learners == 1) and name == 'wiki':
            pp_speedup[name].append((pipelines, ttt))
        if pipelines == 1:
            l_speedup[name].append((learners, ttt))

    for d in datasets.values():
        a = axs[d['idx']]
        a.set_xlabel('Context Windows per Learner')
        a.set_ylabel('Validation Loss')
        a.set_xticks([windows[1]] + [*range(0, 300001, 100000)])
        a.set_xlim(*d['lim'])
        a.set_title(f'{d["name"]} (~{d["words"]} words)')
        a.legend()
        a.axhline(d['target'], color='k', linestyle=':')

    fig.savefig(os.path.join(HERE, 'fig/datasets.pdf'))

    def speedup_plot(zipped):
        min_f = []
        max_f = []
        min_s = []
        max_s = []
        for z in sorted(zipped, key=lambda x: datasets[x]['idx']):
            d = datasets[z]
            factors, time = zip(*sorted(zipped[z]))
            time = np.asarray(time)
            speedup = time[0] / time
            print(factors, time)
            plt.plot(
                factors, speedup,
                label=f'{d["name"]}, target: {d["target"]}',
                color=f'C{d["idx"]}'
            )
            min_s.append(min(speedup))
            max_s.append(max(speedup))
            min_f.append(min(factors))
            max_f.append(max(factors))

        plt.xlim(min(min_f), max(max_f))
        plt.ylim(min(min_s), max(max_s))
        plt.xticks([*range(min(min_f), max(max_f) + 1)])
        plt.yticks([*range(floor(min(min_s)), ceil(max(max_s)) + 1)])
        plt.legend(loc='upper left')
        plt.grid()

    fig = plt.figure(figsize=(10, 4))
    fig.subplots_adjust(left=0.06, right=0.99, top=0.91, wspace=0.18)

    plt.subplot(121)
    speedup_plot(l_speedup)
    plt.title('Single Pipeline')
    plt.xlabel('Number of Learners')
    plt.ylabel(f'Speedup to Target')

    plt.subplot(122)
    speedup_plot(pp_speedup)
    plt.title('Multiple Pipelines')
    plt.xlabel('Number of Pipelines')
    plt.ylabel(f'Speedup to Target')

    plt.savefig(os.path.join(HERE, 'fig/speedups.pdf'))
    plt.show()