Files
fedavg_mpi/library.py

78 lines
2.2 KiB
Python

import os
# Must stay above the tensorflow import below: the variable is set before the
# library is loaded. Presumably level '3' silences TensorFlow's native log
# output -- TODO confirm against the TensorFlow documentation.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import tensorflow as tf
# Limit TensorFlow's compat.v1 Python logger to ERROR messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) # keep TF quiet
# Directory holding this module; the data files below are resolved against it.
HERE = os.path.abspath(os.path.dirname(__file__))
CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
VOCAB = os.path.join(HERE, 'vocab.txt')

# Word -> integer id, where the id is the (0-based) line number of the word in
# the vocabulary file. The file is read inside a ``with`` block so the handle
# is closed deterministically instead of being left to the garbage collector.
with open(VOCAB) as _vocab_file:
    vocab = {
        w: i
        for i, w in enumerate(_vocab_file.read().splitlines(keepends=False))
    }

# Vocabulary words ordered by ascending id (the inverse view of ``vocab``).
inv_vocab = sorted(vocab, key=vocab.get)
def onehot(oh_store, idx):
    """Overwrite ``oh_store`` in place with one-hot rows selected by ``idx``.

    The whole buffer is first reset to zero, then row ``k`` receives a 1 in
    column ``idx[k]`` for every ``k`` in ``range(len(idx))``. Rows beyond
    ``len(idx)`` are left all-zero.

    Args:
        oh_store: Writable 2-D NumPy array used as the output buffer.
        idx: 1-D array (or sequence) of column indices, one per row. Values
            are cast to integers, so float-typed ids are accepted.

    Returns:
        None; the result is written into ``oh_store``.

    Raises:
        IndexError: If ``idx`` has more entries than ``oh_store`` has rows, or
            an index falls outside the column range.
    """
    # ``np.int`` (used previously) was only an alias of the builtin ``int``;
    # it was deprecated in NumPy 1.20 and removed in 1.24, so use ``int``.
    columns = np.asarray(idx).astype(int)
    oh_store[:] = 0
    oh_store[np.arange(len(columns)), columns] = 1
def word_tokenize(s: str):
    """Split ``s`` into lower-case words made of alphabetic characters only.

    Every non-alphabetic character (digits, punctuation, whitespace) acts as
    a separator; alphabetic characters are lower-cased.

    Returns:
        A list of word strings; empty when ``s`` contains no letters.
    """
    normalised = [ch.lower() if ch.isalpha() else ' ' for ch in s]
    cleaned = ''.join(normalised)
    return cleaned.split()
def create_test_dataset(win):
    """Sample 1000 (context, centre-word) examples from the corpus file.

    The corpus is tokenized, words missing from ``vocab`` are dropped, and
    the rest are mapped to their vocabulary ids. Centre positions are drawn
    with ``np.random.choice`` (default settings, i.e. with replacement) from
    the positions that have ``win`` tokens available on both sides.

    Args:
        win: Number of context tokens taken on each side of the centre word.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float32 array of shape
        ``(1000, 2 * win)`` holding the context word ids, and ``y`` is a
        float32 array of shape ``(1000, len(vocab))`` holding the one-hot
        encoded centre words.
    """
    n_samples = 1000

    with open(CORPUS) as corpus_file:
        text = corpus_file.read()
    token_ids = np.array(
        [vocab[tok] for tok in word_tokenize(text) if tok in vocab]
    )

    # Only positions with a full window on either side are eligible.
    eligible = np.arange(win, len(token_ids) - win)
    centres = np.random.choice(eligible, n_samples)

    targets = np.zeros((n_samples, len(vocab)), dtype=np.float32)
    onehot(targets, token_ids[centres])

    # Context = `win` ids before the centre followed by `win` ids after it.
    windows = []
    for pos in centres:
        before = token_ids[pos - win:pos]
        after = token_ids[pos + 1:pos + win + 1]
        windows.append(np.concatenate([before, after]))
    contexts = np.stack(windows, axis=0).astype(np.float32)

    return contexts, targets
def create_mnist_network():
    """Build and compile a small dense classifier.

    Architecture: 784 input features -> Dense(30, relu) -> Dense(10, softmax).
    The model is compiled with categorical cross-entropy loss, the ``'sgd'``
    optimizer and an accuracy metric.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    layers = tf.keras.layers
    hidden = layers.Dense(30, input_shape=(784,), activation='relu')
    output = layers.Dense(10, activation='softmax')

    network = tf.keras.models.Sequential([hidden, output])
    network.compile(
        optimizer='sgd',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network
def create_cbow_network(win, embed):
    """Build and compile a continuous-bag-of-words style network.

    The model takes ``2 * win`` context word ids, embeds each into ``embed``
    dimensions, averages the embeddings over the context axis and projects
    the result onto a softmax over the whole vocabulary.

    Args:
        win: Context size on each side of the centre word; the input has
            ``2 * win`` entries.
        embed: Dimensionality of the embedding vectors.

    Returns:
        The ``tf.keras.Model`` compiled with the ``'sgd'`` optimizer and
        categorical cross-entropy loss.
    """
    layers = tf.keras.layers
    context_len = 2 * win
    vocab_size = len(vocab)

    context_ids = layers.Input(shape=[context_len])
    embedded = layers.Embedding(
        vocab_size, embed, input_length=context_len
    )(context_ids)
    # Collapse the context axis: one averaged embedding per example.
    averaged = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(embedded)
    word_probs = layers.Dense(vocab_size, activation='softmax')(averaged)

    network = tf.keras.Model(inputs=context_ids, outputs=word_probs)
    network.compile(loss='categorical_crossentropy', optimizer='sgd')
    return network
def token_generator(filename):
    """Yield the token list of every line of ``filename`` that contains words.

    Each line is passed through ``word_tokenize``; lines that produce no
    tokens (blank lines, or lines without any alphabetic character) are
    skipped.

    Args:
        filename: Path of the text file to read.

    Yields:
        A non-empty list of lower-case word strings, one list per line.
    """
    with open(filename) as f:
        # Iterate the file object directly so lines are streamed on demand
        # instead of loading the whole file up front with readlines().
        for line in f:
            tok = word_tokenize(line)
            # Whitespace-only lines tokenize to [], so this single check also
            # covers the blank-line case.
            if tok:
                yield tok