library(tensorflow)
library(keras)
library(data.table)
library(tfdatasets)
library(tfaddons)
# Preprocessing -----------------------------------------------------------
# The helper below downloads one of the bilingual datasets offered at
# http://www.manythings.org/anki/ and unzips it into a directory "data".
# This example translates English to Dutch.
download_data = function() {
  if (!dir.exists('data')) {
    dir.create('data')
  }
  if (!file.exists('data/nld-eng.zip')) {
    download.file('http://www.manythings.org/anki/nld-eng.zip',
                  destfile = file.path('data', 'nld-eng.zip'))
    unzip('data/nld-eng.zip', exdir = 'data')
  }
}
download_data()
filepath <- file.path("data", "nld.txt")
df = data.table::fread(filepath, header = FALSE, encoding = 'UTF-8',
                       select = c(1, 2), nrows = -1)
text_cleaner <- function(text) {
  text %>%
    # replace non-ASCII characters
    textclean::replace_non_ascii() %>%
    # strip everything except letters and spaces (apostrophes are removed as well)
    textclean::strip(apostrophe.remove = TRUE) %>%
    # add the start / end tokens the decoder relies on
    paste('<start> ', ., ' <end>')
}
df = sapply(1:2, function(x) text_cleaner(df[[x]])) %>% as.data.table()
text_tok <- function(text) {
  tokenizer = text_tokenizer(filters = '')
  tokenizer %>% fit_text_tokenizer(text)
  # +1 because word indices start at 1 and 0 is reserved for padding
  vocab_size = length(tokenizer$word_index) + 1L
  data = tokenizer %>%
    texts_to_sequences(text) %>%
    pad_sequences(padding = 'post')
  list(vocab_size, data, tokenizer)
}
c(input_vocab_size, data_en, tokenizer_en) %<-% (df[['V1']] %>% text_tok())
c(output_vocab_size, data_de, tokenizer_de) %<-% (df[['V2']] %>% text_tok())
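# Optional sanity check: the exact numbers depend on the current snapshot of
# nld-eng.zip, but the shapes should look like [n_pairs, padded_length].
dim(data_en)        # padded English sequences
dim(data_de)        # padded Dutch sequences
input_vocab_size    # English vocabulary size (including the padding index)
output_vocab_size   # Dutch vocabulary size (including the padding index)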
# Split the dataset
indices_to_take = sample.int(n = nrow(df), size = floor(0.8*nrow(df)), replace = FALSE)
split_data <- function(data) {
  c(train, test) %<-% list(data[indices_to_take, ], data[-indices_to_take, ])
  list(train, test)
}
c(en_train, en_test, de_train, de_test) %<-% c(split_data(data_en), split_data(data_de))
rm(df, filepath, indices_to_take, download_data, split_data, text_cleaner, text_tok)
batch_size = 64L
buffer_size = nrow(en_train)
steps_per_epoch = buffer_size %/% batch_size
embedding_dims = 256L
rnn_units = 1024L
dense_units = 1024L
dtype = tf$float32 # used to initialize the decoder cell's zero state
dataset = tensor_slices_dataset(list(en_train, de_train)) %>%
  dataset_shuffle(buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)
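# Optional: pull a single batch to confirm the pipeline emits pairs of
# [batch_size, padded_length] integer tensors. The example_* names below are
# only used for this check.
example_batch = dataset %>% dataset_take(1) %>% iterate()
c(example_en, example_de) %<-% example_batch[[1]]
example_en$shape
example_de$shape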
EncoderNetwork = reticulate::PyClass(
  'EncoderNetwork',
  inherit = tf$keras$Model,
  defs = list(
    `__init__` = function(self, input_vocab_size, embedding_dims, rnn_units) {
      super()$`__init__`()
      self$encoder_embedding = layer_embedding(input_dim = input_vocab_size,
                                               output_dim = embedding_dims)
      self$encoder_rnnlayer = layer_lstm(units = rnn_units, return_sequences = TRUE,
                                         return_state = TRUE)
      NULL
    }
  )
)
DecoderNetwork = reticulate::PyClass(
  'DecoderNetwork',
  inherit = tf$keras$Model,
  defs = list(
    `__init__` = function(self, output_vocab_size, embedding_dims, rnn_units) {
      super()$`__init__`()
      self$decoder_embedding = layer_embedding(input_dim = output_vocab_size,
                                               output_dim = embedding_dims)
      self$dense_layer = layer_dense(units = output_vocab_size)
      self$decoder_rnncell = tf$keras$layers$LSTMCell(rnn_units)
      # Sampler
      self$sampler = sampler_training()
      # Create the attention mechanism with memory = NULL; the encoder output
      # is plugged in later via setup_memory()
      self$attention_mechanism = self$build_attention_mechanism(
        dense_units, NULL, rep(ncol(data_en), batch_size))
      self$rnn_cell = self$build_rnn_cell(batch_size)
      self$decoder = decoder_basic(cell = self$rnn_cell, sampler = self$sampler,
                                   output_layer = self$dense_layer)
      NULL
    },
    build_attention_mechanism = function(self, units, memory, memory_sequence_length) {
      attention_luong(units = units, memory = memory,
                      memory_sequence_length = memory_sequence_length)
    },
    build_rnn_cell = function(self, batch_size) {
      attention_wrapper(cell = self$decoder_rnncell,
                        attention_mechanism = self$attention_mechanism,
                        attention_layer_size = dense_units)
    },
    build_decoder_initial_state = function(self, batch_size, encoder_state, dtype) {
      decoder_initial_state = self$rnn_cell$get_initial_state(batch_size = batch_size,
                                                              dtype = dtype)
      decoder_initial_state$clone(cell_state = encoder_state)
    }
  )
)
encoderNetwork = EncoderNetwork(input_vocab_size, embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(output_vocab_size, embedding_dims, rnn_units)
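# Optional smoke test (purely illustrative): push one batch through the encoder
# layers and inspect the shapes before training. The smoke_* names are only
# used here.
c(smoke_en, smoke_de) %<-% (dataset %>% dataset_take(1) %>% iterate())[[1]]
smoke_emb = encoderNetwork$encoder_embedding(smoke_en)
c(smoke_a, smoke_h, smoke_c) %<-% encoderNetwork$encoder_rnnlayer(
  smoke_emb,
  initial_state = list(tf$zeros(c(batch_size, rnn_units)),
                       tf$zeros(c(batch_size, rnn_units))))
smoke_a$shape   # (batch_size, ncol(data_en), rnn_units)
smoke_h$shape   # (batch_size, rnn_units)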
optimizer = tf$keras$optimizers$Adam()
loss_function <- function(y_pred, y) {
  # shape of y:      [batch_size, Ty]
  # shape of y_pred: [batch_size, Ty, output_vocab_size] (unnormalized logits)
  loss = tf$keras$losses$sparse_categorical_crossentropy(y, y_pred, from_logits = TRUE)
  # mask out padded positions: 0 where y == 0 (padding), 1 elsewhere
  mask = tf$logical_not(tf$math$equal(y, 0L))
  mask = tf$cast(mask, dtype = loss$dtype)
  loss = mask * loss
  tf$reduce_mean(loss)
}
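# Tiny illustration of the masking (not part of training): y_toy has one padded
# position (the trailing 0), so only the first two timesteps contribute to the
# loss. The random logits_toy merely have the right shape.
y_toy      = tf$constant(matrix(c(5L, 3L, 0L), nrow = 1))
logits_toy = tf$random$uniform(shape(1, 3, output_vocab_size))
loss_function(logits_toy, y_toy)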
train_step <- function(input_batch, output_batch, encoder_initial_cell_state) {
  loss = 0L
  with(tf$GradientTape() %as% tape, {
    encoder_emb_inp = encoderNetwork$encoder_embedding(input_batch)
    # a: all-step activations, a_tx: last hidden state, c_tx: last cell state;
    # the states are handed on to the decoder network
    c(a, a_tx, c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoder_emb_inp,
                                                          initial_state = encoder_initial_cell_state)
    # Prepare the decoder input and target sequences from the padded target batch
    Ty = ncol(data_de)
    decoder_input  = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 1:(Ty - 1)]) # drop <end>
    # compare logits with the decoder input shifted by one timestep
    decoder_output = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 2:Ty])       # drop <start>
    # Decoder embeddings
    decoder_emb_inp = decoderNetwork$decoder_embedding(decoder_input)
    # Set up the decoder memory from the encoder output and build the initial AttentionWrapperState
    decoderNetwork$attention_mechanism$setup_memory(a)
    decoder_initial_state = decoderNetwork$build_decoder_initial_state(batch_size,
                                                                       encoder_state = list(a_tx, c_tx),
                                                                       dtype = tf$float32)
    # BasicDecoderOutput
    c(outputs, res1, res2) %<-% decoderNetwork$decoder(decoder_emb_inp,
                                                       initial_state = decoder_initial_state,
                                                       sequence_length = rep(Ty - 1L, batch_size))
    logits = outputs$rnn_output
    # Calculate the masked loss
    loss = loss_function(logits, decoder_output)
  })
  # List of all trainable variables of both networks
  variables = c(encoderNetwork$trainable_variables, decoderNetwork$trainable_variables)
  # differentiate the loss w.r.t. the variables
  gradients = tape$gradient(loss, variables)
  # grads_and_vars: list of (gradient, variable) pairs
  grads_and_vars = purrr::transpose(list(gradients, variables))
  optimizer$apply_gradients(grads_and_vars)
  loss
}
initialize_initial_state = function() {
  list(tf$zeros(c(batch_size, rnn_units)), tf$zeros(c(batch_size, rnn_units)))
}
epochs = 1
for (i in seq_len(epochs)) {
  encoder_initial_cell_state = initialize_initial_state()
  total_loss = 0.0
  res = dataset %>% dataset_take(steps_per_epoch) %>% iterate()
  for (batch in seq_along(res)) {
    c(input_batch, output_batch) %<-% res[[batch]]
    batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
    total_loss = total_loss + batch_loss
    if (batch %% 5 == 0) {
      print(paste('batch loss:', batch_loss$numpy(), 'epoch', i, 'batch', batch))
    }
  }
  print(paste('total loss after epoch', i, ':', total_loss$numpy()))
}