tokenize FASTER
bridge.pyx (14 changed lines)
@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1
 
 
+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
+
+
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)
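
The list-to-dict switch in vocab_idx_of is where the speedup in the commit title comes from: nn.vocab.index(word) scans the whole vocabulary list for every token (O(n)), while nn.vocab[word] is a single hash lookup (O(1) amortized). A minimal standalone sketch of the difference, using a hypothetical 20k-word vocabulary rather than the project's real one:

    import timeit

    vocab_list = ['word%d' % i for i in range(20000)]
    vocab_dict = {w: i for i, w in enumerate(vocab_list)}

    target = 'word19999'  # worst case for the linear scan

    # list.index walks the list until it finds the word
    print(timeit.timeit(lambda: vocab_list.index(target), number=1000))
    # dict lookup hashes the word once
    print(timeit.timeit(lambda: vocab_dict[target], number=1000))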

library.py (19 changed lines)
@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
-# sw = set(stopwords.words('english'))
-sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)
 
 
 def create_mnist_network():
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model
 
 
-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
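
vocab is now a word-to-index dict built from vocab.txt, and inv_vocab recovers the index-to-word direction by sorting the keys on their mapped values. A toy round trip under that assumption (the word list here is invented, not the real vocab.txt):

    lines = ['whale', 'ship', 'sea']             # stand-in for open(VOCAB).read().splitlines()
    vocab = {w: i for i, w in enumerate(lines)}  # word -> index
    inv_vocab = sorted(vocab, key=vocab.get)     # index -> word, ordered by index

    assert inv_vocab == lines
    assert all(vocab[w] == i for i, w in enumerate(inv_vocab))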

main.c (34 changed lines)
@@ -17,7 +17,7 @@
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }
 
-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
-
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                      TAG_IWORD, MPI_COMM_WORLD);
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;
 
-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+        //     INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         //          MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
         free(f_widx);
     }
 
+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
+
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready
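
The pipeline ends each stream with a sentinel: tokenizer sends an empty terminator word and filterer breaks on !strlen(w.data). A minimal mpi4py sketch of the same sentinel pattern (the ranks and tag are hypothetical stand-ins for the project's role mapping and TAG_* constants):

    # Run with: mpiexec -n 2 python sentinel_demo.py
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    TAG_WORD = 7  # hypothetical tag

    if comm.rank == 0:
        # producer: stream words, then an empty-string sentinel
        for word in ['call', 'me', 'ishmael']:
            comm.send(word, dest=1, tag=TAG_WORD)
        comm.send('', dest=1, tag=TAG_WORD)
    elif comm.rank == 1:
        # consumer: stop on the sentinel, like `if (!strlen(w.data)) break;`
        while True:
            word = comm.recv(source=0, tag=TAG_WORD)
            if not word:
                break
            print(word)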