tokenize FASTER
bridge.pyx (14 changed lines)
@@ -59,11 +59,21 @@ cdef public int get_tokens(WordList* wl, const char *filename):
 cdef public long vocab_idx_of(Word* w):
     word = w.data.decode('utf-8')
     try:
-        return nn.vocab.index(word)
-    except ValueError:
+        return nn.vocab[word]
+    except KeyError:
         return -1


+cdef public void f_idx_list_to_print(float* f_idxs, size_t num):
+    idxs = np.asarray(<float[:num]>f_idxs).astype(np.int)
+    cdef str pyuni = ' '.join(nn.inv_vocab[i] for i in idxs)
+    print(pyuni)
+    # cdef bytes b = pyuni.encode('utf-8')
+    # cdef char* retval = <char*>malloc((len(b) + 1) * sizeof(char))
+    # retval[len(b)] = 0
+    # return retval
+
+
 cdef public void c_onehot(float* y, float* idxs, size_t n_idx):
     oh = nn.onehot(np.asarray(<float[:n_idx]>idxs))
     ensure_contiguous(oh)
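The lookup change above is where this commit earns its name: nn.vocab used to be a Python list, so nn.vocab.index(word) scanned the vocabulary linearly for every token, while the new dict-backed nn.vocab resolves a word in amortized O(1). A minimal sketch of the difference, using a toy vocabulary rather than the project's data:

    import timeit

    words = ['w%d' % i for i in range(20000)]         # stand-in vocabulary
    as_list = words                                   # old: list + .index() -> ValueError on miss
    as_dict = {w: i for i, w in enumerate(words)}     # new: dict + [] -> KeyError on miss

    # Look up a word near the end of the vocabulary many times.
    print(timeit.timeit(lambda: as_list.index('w19999'), number=1000))  # O(n) scan per call
    print(timeit.timeit(lambda: as_dict['w19999'], number=1000))        # O(1) hash lookup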
library.py (19 changed lines)
@@ -15,12 +15,13 @@ def word_tokenize(s: str):
 HERE = os.path.abspath(os.path.dirname(__file__))
 CORPUS = os.path.join(HERE, 'melville-moby_dick.txt')
 # sw = set(stopwords.words('english'))
 sw = ['the']
-vocab = list(set(
-    w.lower() for w in word_tokenize(open(CORPUS).read())
-    if w.isalpha() and not w.lower() in sw
-))
+VOCAB = os.path.join(HERE, 'vocab.txt')
+
+vocab = {
+    w: i for i, w in enumerate(open(VOCAB).read().splitlines(keepends=False))
+}
+# inv_vocab = [vocab[i] for i in range(len(vocab))]
+inv_vocab = sorted(vocab, key=vocab.get)


 def create_mnist_network():
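library.py now reads a prebuilt vocab.txt (one word per line) instead of re-tokenizing the corpus at import time, and stores the vocabulary as a word-to-index dict. Since line order is index order, sorted(vocab, key=vocab.get) recovers the index-to-word list. A small round-trip sketch with a made-up three-word file:

    lines = ['whale', 'ship', 'sea']              # stands in for open(VOCAB).read().splitlines()
    vocab = {w: i for i, w in enumerate(lines)}   # {'whale': 0, 'ship': 1, 'sea': 2}
    inv_vocab = sorted(vocab, key=vocab.get)      # ['whale', 'ship', 'sea'] again

    assert all(inv_vocab[i] == w for w, i in vocab.items())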
@@ -33,9 +34,9 @@ def create_mnist_network():
     return model


-def create_cbow_network(win, vocab, embed):
+def create_cbow_network(win, vocsize, embed):
     ctxt = tf.keras.layers.Input(shape=[win])
-    ed = tf.keras.layers.Embedding(vocab, embed, input_length=win)(ctxt)
+    ed = tf.keras.layers.Embedding(vocsize, embed, input_length=win)(ctxt)
     avgd = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(ed)
     mod = tf.keras.Model(inputs=ctxt, outputs=avgd)
     mod.compile(
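Renaming the parameter from vocab to vocsize is likely meant to avoid confusion with the new module-level vocab dict, and it makes the intent explicit: Embedding takes a vocabulary size, not the mapping itself. A usage sketch; the call site and constants are assumptions (WIN and EMB mirror the #defines in main.c, VOCSIZE is a placeholder for len(vocab)), and it assumes library.py and its data files are importable:

    import numpy as np
    from library import create_cbow_network   # assumes library.py is on the path

    WIN, EMB, VOCSIZE = 2, 20, 1000
    model = create_cbow_network(2 * WIN, VOCSIZE, EMB)   # both sides of the context window

    ctx = np.random.randint(0, VOCSIZE, size=(4, 2 * WIN))  # batch of 4 context windows
    print(model(ctx).shape)                    # (4, 20): mean of the context embeddings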
@@ -47,7 +48,7 @@ def create_cbow_network(win, vocab, embed):
 def token_generator(filename):
     with open(filename) as f:
-        for l in f.readlines():
+        for i, l in enumerate(f.readlines()):
             if not l.isspace():
                 tok = word_tokenize(l)
                 if tok:
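The new enumerate suggests the generator now tracks line numbers as well; the body under "if tok:" is cut off by the diff context, so what it does with i isn't visible here. For orientation only, a hypothetical sketch of the usual shape of such a generator, assuming it yields lowercased tokens one at a time:

    def token_generator_sketch(filename):
        # illustrative stand-in, not this project's actual implementation
        with open(filename) as f:
            for i, l in enumerate(f.readlines()):
                if not l.isspace():
                    tok = word_tokenize(l)     # word_tokenize as defined in library.py
                    if tok:
                        yield from (w.lower() for w in tok)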
main.c (34 changed lines)
@@ -17,7 +17,7 @@
 #define COMM 100
 #define ITER 20
-#define BS 20
+#define BS 10
 #define EMB 20
 #define WIN 2
 #define FSPC 1
@@ -99,14 +99,6 @@ Role map_node() {
     exit(1); // this is bad
 }

-void free_weightlist(WeightList* wl) {
-    for in_range(i, wl->n_weights) {
-        free(wl->weights[i].shape);
-        free(wl->weights[i].W);
-    }
-    free(wl->weights);
-}
-
 void free_word(Word* w) {
     free(w->data);
     w->data = NULL;
@@ -145,11 +137,9 @@ void tokenizer(const char* source) {
     while (get_tokens(&wl, source)) {
         for in_range(i, wl.n_words) {
             send_word(&wl.words[i], mpi_id_from_role_id(FILTERER, 0));
-            // printf("OI %s\n", wl.words[i].data);
         }
-        // INFO_PRINTLN("");
     }
-    Word terminator = {0, ""};
+    Word terminator = {1, ""};
     send_word(&terminator, mpi_id_from_role_id(FILTERER, 0));
     free_wordlist(&wl);
 }
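The terminator is a sentinel Word that tells the downstream filterer the token stream is finished; the filterer breaks when it sees an empty string. The change from {0, ""} to {1, ""} presumably corrects the length field so the empty payload (its NUL byte) actually travels over MPI, though that is an inference, since the Word struct definition isn't in this diff. The same shutdown pattern in plain Python, with a queue standing in for MPI:

    from queue import Queue
    from threading import Thread

    q = Queue()
    SENTINEL = ''                  # plays the role of the terminator Word

    def consumer():
        while True:
            w = q.get()
            if not len(w):         # mirrors: if (!strlen(w.data)) break;
                break
            print('got', w)

    t = Thread(target=consumer)
    t.start()
    for w in ['call', 'me', 'ishmael']:
        q.put(w)
    q.put(SENTINEL)                # shut the consumer down
    t.join()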
@@ -162,9 +152,7 @@ void filterer() {
         if (!strlen(w.data)) {
             break;
         }
-        // INFO_PRINTF("%s: ", w.data);
         idx = vocab_idx_of(&w);
-        // INFO_PRINTF("%ld\n", idx);
         if (idx != -1) {
             MPI_Send(&idx, 1, MPI_LONG, mpi_id_from_role_id(BATCHER, 0),
                      TAG_IWORD, MPI_COMM_WORLD);
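vocab_idx_of returns -1 for out-of-vocabulary words (the KeyError branch in bridge.pyx), so the filterer forwards only indices of known words to the batcher. The same filtering step expressed in Python, with a toy vocabulary:

    vocab = {'whale': 0, 'ship': 1, 'sea': 2}   # toy stand-in for the real vocab dict

    def vocab_idx_of(word):
        try:
            return vocab[word]                  # the new dict lookup from bridge.pyx
        except KeyError:
            return -1

    tokens = ['the', 'whale', 'spouts']
    kept = [i for i in map(vocab_idx_of, tokens) if i != -1]
    print(kept)                                 # [0] -- only 'whale' is in vocabulary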
@@ -194,11 +182,11 @@ void batcher() {
         }
         if (l_wid == -1) break;

-        for in_range(i, n_words) {
-            INFO_PRINTF("%5.0f ", f_widx[i]);
-        }
-
-        INFO_PRINTLN("");
+        // f_idx_list_to_c_string(f_widx, n_words);
+        // for in_range(i, n_words) {
+        //     INFO_PRINTF("%5.0f ", f_widx[i]);
+        // }
+        // INFO_PRINTLN("");
         // MPI_Recv(&s, 1, MPI_INT, MPI_ANY_SOURCE, TAG_READY, MPI_COMM_WORLD,
         //          MPI_STATUS_IGNORE);
         // if (s != -1) {
@@ -208,6 +196,14 @@ void batcher() {
     free(f_widx);
 }

+void free_weightlist(WeightList* wl) {
+    for in_range(i, wl->n_weights) {
+        free(wl->weights[i].shape);
+        free(wl->weights[i].W);
+    }
+    free(wl->weights);
+}
+
 void send_weights(const WeightList* wl, int dest, int tag) {
     // This assumes that the receiving end knows exactly
     // the number of elements being sent and has memory ready