tokenize FASTER

2019-12-01 09:44:40 -08:00
parent 7409eca38b
commit 569bbf7397
3 changed files with 37 additions and 30 deletions


@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)

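The speed-up in the hunk above comes from the lookup in vocab_idx_of: nn.vocab.index(word) scans a Python list once per token, while nn.vocab[word] is a single hash lookup in a dict, which is why the exception to catch changes from ValueError to KeyError. A minimal sketch of the difference (plain Python, with a made-up vocabulary size; the numbers are illustrative, not measurements from this repository):

    import timeit

    words = [f"w{i}" for i in range(20000)]             # stand-in vocabulary
    vocab_list = words                                   # old representation: list
    vocab_dict = {w: i for i, w in enumerate(words)}     # new representation: dict

    token = "w19999"  # worst case for the list: the last entry
    t_list = timeit.timeit(lambda: vocab_list.index(token), number=1000)
    t_dict = timeit.timeit(lambda: vocab_dict[token], number=1000)
    print(f"list.index: {t_list:.4f}s   dict lookup: {t_dict:.4f}s")
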

@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)
 def create_mnist_network():
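
With vocab now a word-to-index dict loaded from vocab.txt, the reverse index-to-word list is recovered by sorting the keys by their value, which is what inv_vocab = sorted(vocab, key=vocab.get) above does. A tiny sketch with a hand-written stand-in vocabulary:

    vocab = {'whale': 0, 'ahab': 1, 'sea': 2}    # toy stand-in for vocab.txt
    inv_vocab = sorted(vocab, key=vocab.get)     # ['whale', 'ahab', 'sea']
    assert all(vocab[w] == i for i, w in enumerate(inv_vocab))
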
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:

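Renaming the second parameter of create_cbow_network from vocab to vocsize makes explicit that the Keras model only needs the vocabulary size (the Embedding layer's input dimension), not the mapping itself. A hedged usage sketch, assuming the function returns the compiled model as create_mnist_network does; the window and embedding sizes merely echo the WIN and EMB defines in main.c below and are not taken from an actual call site in this repository:

    # assumed call site, with vocab and create_cbow_network from this module
    model = create_cbow_network(win=2, vocsize=len(vocab), embed=20)
    # shapes: Input (batch, 2) -> Embedding (batch, 2, 20) -> mean over window (batch, 20)
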
main.c

@@ -17,7 +17,7 @@
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
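
BS, WIN and EMB presumably stand for the batch size, context-window width and embedding dimension consumed by the Python side (create_cbow_network builds Input(shape=[win]) into an Embedding of size embed). The sketch below shows one common way word indices can be grouped into CBOW (context, target) mini-batches under these settings; it is a generic illustration, not the repository's batcher logic:

    import numpy as np

    WIN, BS = 2, 10  # mirror the defines above: context width and batch size

    def cbow_batches(idxs, win=WIN, bs=BS):
        # For each position, the win surrounding indices (win // 2 on each
        # side) form the context and the centre word is the target, matching
        # Input(shape=[win]) in the Keras model.
        half = win // 2
        ctx, tgt = [], []
        for i in range(half, len(idxs) - half):
            ctx.append(idxs[i - half:i] + idxs[i + 1:i + 1 + half])
            tgt.append(idxs[i])
            if len(ctx) == bs:
                yield np.array(ctx, dtype=np.float32), np.array(tgt)
                ctx, tgt = [], []

    # toy usage with made-up word indices
    for x, y in cbow_batches(list(range(25))):
        print(x.shape, y.shape)   # (10, 2) (10,)
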
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                 TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+        // INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         // MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
     free(f_widx);
 }
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready