def test(self, sess, token_ids):
    """Run the model on a single sentence and return per-row argmax ids.

    Args:
        sess: session used to evaluate ``self.prediction``.
        token_ids: token ids of the input sentence; passed through
            ``data_utils.padding`` before batching.

    Returns:
        A numpy integer array holding the argmax along axis 1 of the fetched
        prediction (presumably one id per decoding step -- confirm against
        the shape of ``self.prediction``).
    """
    # Local import keeps this method self-contained.
    import numpy as np

    # We decode one sentence at a time: a batch made of a single
    # (source, GO-only target, EOS-only label) triple.
    token_ids = data_utils.padding(token_ids)
    target_ids = data_utils.padding([data_utils.GO_ID])
    y_ids = data_utils.padding([data_utils.EOS_ID])
    encoder_inputs, decoder_inputs, _, _ = data_utils.nextRandomBatch(
        [(token_ids, target_ids, y_ids)], batch_size=1)

    prediction = sess.run(
        self.prediction,
        feed_dict={
            self.encoder_inputs: encoder_inputs,
            self.decoder_inputs: decoder_inputs,
        })

    # Take the argmax on the fetched numpy array. Building a TensorFlow
    # argmax op here would add a new node to the graph on every call and
    # would require a default session to evaluate it.
    return np.argmax(prediction, axis=1)
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    """Greedily decode one sentence and return the words joined by spaces.

    Args:
        sess: session handed to ``model.step``.
        model: object exposing ``get_batch`` and ``step``.
        enc_vocab: vocabulary used to turn the sentence into token ids.
        rev_dec_vocab: indexable mapping from output id to token.
        sentence: raw input sentence.

    Returns:
        The decoded tokens up to (not including) the first EOS id, joined
        with single spaces.

    Raises:
        ValueError: if no bucket has an encoder size larger than the number
            of input tokens (``min`` of an empty sequence).
    """
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)

    # Smallest bucket whose encoder size strictly exceeds the input length.
    bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                    if bucket[0] > len(token_ids))

    # One-element batch: the sentence paired with an empty target.
    single_batch = {bucket_id: [(token_ids, [])]}
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(single_batch, bucket_id)

    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)

    # Greedy decoding: keep the argmax of every step's logits.
    outputs = []
    for logit in output_logits:
        outputs.append(int(np.argmax(logit, axis=1)))

    # Drop everything from the first EOS onwards.
    if data_utils.EOS_ID in outputs:
        eos_at = outputs.index(data_utils.EOS_ID)
        outputs = outputs[:eos_at]

    words = [tf.compat.as_str(rev_dec_vocab[token]) for token in outputs]
    return " ".join(words)
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    """Greedily decode one sentence and return the tokens concatenated.

    Unlike a space-joined decoder, the output tokens are joined with the
    empty string. The input token ids are printed to stdout.

    Args:
        sess: session handed to ``model.step``.
        model: object exposing ``get_batch`` and ``step``.
        enc_vocab: vocabulary used to turn the sentence into token ids.
        rev_dec_vocab: indexable mapping from output id to token.
        sentence: input sentence, passed to the tokenizer unchanged.

    Returns:
        The decoded tokens up to (not including) the first EOS id,
        concatenated without a separator.

    Raises:
        ValueError: if no bucket has an encoder size larger than the number
            of input tokens (``min`` of an empty sequence).
    """
    token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
    print(token_ids)

    # Smallest bucket whose encoder size strictly exceeds the input length.
    bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                    if bucket[0] > len(token_ids))

    # One-element batch: the sentence paired with an empty target.
    single_batch = {bucket_id: [(token_ids, [])]}
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(single_batch, bucket_id)
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)

    # Greedy decoding: keep the argmax of every step's logits.
    outputs = []
    for logit in output_logits:
        outputs.append(int(np.argmax(logit, axis=1)))

    # Drop everything from the first EOS onwards.
    if data_utils.EOS_ID in outputs:
        eos_at = outputs.index(data_utils.EOS_ID)
        outputs = outputs[:eos_at]

    pieces = [tf.compat.as_str(rev_dec_vocab[token]) for token in outputs]
    return "".join(pieces)
def read_data(config, source_path, target_path, max_size=None):
    """Read aligned source/target id files and sort the pairs into buckets.

    Args:
        config: object whose ``buckets`` attribute is a sequence of
            ``(source_size, target_size)`` pairs.
        source_path: file with whitespace-separated integer ids per line.
        target_path: file aligned line-by-line with ``source_path``.
        max_size: maximum number of line pairs to process; a falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A list with one entry per bucket; entry ``n`` holds the
        ``[source_ids, target_ids]`` pairs placed in bucket ``n``. Each pair
        goes to the first bucket with ``len(source_ids) < source_size`` and
        ``len(target_ids) < target_size`` (the target length includes the
        appended EOS id); pairs fitting no bucket are dropped.
    """
    data_set = [[] for _ in config.buckets]
    with gfile.GFile(source_path, mode="r") as source_file, \
            gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print("reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.strip().split()))
            target_ids = list(map(int, target.strip().split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(config.buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def get_batch(self, data_set, batch_size, random=True):
    """Build one padded batch from ``data_set``.

    Each sequence is used both as encoder input (reversed) and as decoder
    input (wrapped in GO ... EOS).

    Args:
        data_set: indexable collection of token-id lists; every sequence
            must be no longer than ``self.max_seq_length``.
        batch_size: number of rows in the returned arrays.
        random: if true, sample ``batch_size`` sequences with replacement;
            otherwise take the first ``batch_size`` sequences.

    Returns:
        A 4-tuple ``(encoder_inputs, decoder_inputs, encoder_lengths,
        decoder_weights)``. The three 2-D arrays are transposed before being
        returned, so their first axis is the sequence position and their
        second axis is the batch index. ``encoder_lengths`` is a 1-D array of
        the unpadded sequence lengths.
    """
    if random:
        # Sample indices rather than passing the sequences to
        # np.random.choice: choice() requires a 1-D array, so a list of
        # token lists (2-D or ragged) would be rejected.
        picks = np.random.choice(len(data_set), size=batch_size)
        seqs = [data_set[idx] for idx in picks]
    else:
        seqs = data_set[0:batch_size]

    max_len = self.max_seq_length
    encoder_inputs = np.zeros((batch_size, max_len), dtype=int)
    # Two extra decoder slots for the GO and EOS symbols.
    decoder_inputs = np.zeros((batch_size, max_len + 2), dtype=int)
    encoder_lengths = np.zeros(batch_size)
    decoder_weights = np.zeros((batch_size, max_len + 2), dtype=float)

    for i, seq in enumerate(seqs):
        padding = [data_utils.PAD_ID] * (max_len - len(seq))
        # Encoder sees the sequence reversed, padded on the right.
        encoder_inputs[i] = np.array(list(reversed(seq)) + padding)
        decoder_inputs[i] = np.array(
            [data_utils.GO_ID] + seq + [data_utils.EOS_ID] + padding)
        encoder_lengths[i] = len(seq)
        # Weight 1.0 on the first len(seq) + 1 decoder positions, 0 after.
        decoder_weights[i, 0:(len(seq) + 1)] = 1.0

    return (np.transpose(encoder_inputs), np.transpose(decoder_inputs),
            encoder_lengths, np.transpose(decoder_weights))
def read_data(source_path, target_path, max_size=None):
    """Read aligned source/target id files and sort the pairs into buckets.

    Args:
        source_path: file with whitespace-separated integer ids per line.
        target_path: file aligned line-by-line with ``source_path``.
        max_size: maximum number of line pairs to process; a falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A list of length ``len(_buckets)``; entry ``n`` holds the
        ``[source_ids, target_ids]`` pairs placed in bucket ``n``. Each pair
        goes to the first bucket with ``len(source_ids) < source_size`` and
        ``len(target_ids) < target_size`` (the target length includes the
        appended EOS id); pairs fitting no bucket are dropped.
    """
    data_set = [[] for _ in _buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def run(self, sentence):
    """Greedily decode ``sentence`` with ``self.model``.

    Reads ``self.en_vocab``, ``self.model``, ``self.sess`` and
    ``self.rev_fr_vocab``.

    Args:
        sentence: input sentence, passed to the tokenizer unchanged.

    Returns:
        The output tokens up to (not including) the first EOS id,
        concatenated without a separator.

    Raises:
        ValueError: if no bucket has an encoder size larger than the number
            of input tokens (``min`` of an empty sequence).
    """
    token_ids = data_utils.sentence_to_token_ids(sentence, self.en_vocab)

    # Smallest bucket whose encoder size strictly exceeds the input length.
    bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                    if bucket[0] > len(token_ids))

    # One-element batch: the sentence paired with an empty target.
    single_batch = {bucket_id: [(token_ids, [])]}
    encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
        single_batch, bucket_id)

    _, _, output_logits = self.model.step(self.sess, encoder_inputs, decoder_inputs,
                                          target_weights, bucket_id, True)

    # Greedy decoding: keep the argmax of every step's logits.
    outputs = []
    for logit in output_logits:
        outputs.append(int(np.argmax(logit, axis=1)))

    # Drop everything from the first EOS onwards.
    if data_utils.EOS_ID in outputs:
        eos_at = outputs.index(data_utils.EOS_ID)
        outputs = outputs[:eos_at]

    pieces = [self.rev_fr_vocab[token] for token in outputs]
    return "".join(pieces)
def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess):
    """Decode ``input_sentence`` and return the reply as a space-joined string.

    Inputs that do not fit the largest bucket are truncated to one token
    less than its encoder size. The input token ids are printed to stdout,
    and ``global_memory['inp']`` is set to 1 before the model step.

    Args:
        input_sentence: raw sentence to decode.
        vocab: vocabulary used to turn the sentence into token ids.
        rev_vocab: indexable mapping from output id to token.
        model: object exposing ``get_batch`` and ``step``.
        sess: session handed to ``model.step``.

    Returns:
        The output tokens preceding the first EOS id, joined with spaces.
    """
    input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab)
    print(input_token_ids)

    # Clip over-long inputs so that the largest bucket always qualifies.
    largest_encoder_size = BUCKETS[-1][0]
    if len(input_token_ids) >= largest_encoder_size:
        input_token_ids = input_token_ids[:largest_encoder_size - 1]

    # Smallest bucket whose encoder size strictly exceeds the input length.
    bucket_id = min(idx for idx, bucket in enumerate(BUCKETS)
                    if bucket[0] > len(input_token_ids))

    # One-element batch: the sentence paired with an empty target.
    feed_data = {bucket_id: [(input_token_ids, [])]}
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(feed_data, bucket_id)

    global_memory['inp'] = 1

    # NOTE(review): step runs with beam_search=True, yet its third result is
    # consumed below as per-step logits via argmax -- confirm this is intended.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id,
                                     forward_only=True, beam_search=True)

    # Collect argmax ids step by step, stopping at the first EOS.
    decoded_ids = []
    for logit in output_logits:
        selected_token_id = int(np.argmax(logit, axis=1))
        if selected_token_id == data_utils.EOS_ID:
            break
        decoded_ids.append(selected_token_id)

    # Map ids back to natural-language tokens.
    return ' '.join(rev_vocab[token_id] for token_id in decoded_ids)
def main(_):
    """Interactive loop: read lines from stdin, decode them, print the result.

    Loads the source and target vocabularies from ``FLAGS.data_dir``, builds
    the model, then for each input line segments it with ``jieba``, maps the
    segments to ids, runs ``model.test`` and prints the capitalised output
    sentence. The loop ends at end-of-file on stdin.

    Args:
        _: unused positional argument.
    """
    print("Loading vocabulary")
    cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt")
    en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt")
    cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path)
    _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

    print("Building model...")
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        model = create_model(sess, False)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for sentence in iter(sys.stdin.readline, ""):
            seg_list = jieba.lcut(sentence.strip())
            # Segments missing from the vocabulary map to UNK.
            token_ids = []
            for word in seg_list:
                key = word.encode(encoding="utf-8")
                token_ids.append(cn_vocab.get(key, data_utils.UNK_ID))

            predicted = model.test(sess, token_ids).tolist()
            # Drop everything from the first EOS onwards.
            if data_utils.EOS_ID in predicted:
                eos_at = predicted.index(data_utils.EOS_ID)
                predicted = predicted[:eos_at]

            words = [tf.compat.as_str(rev_en_vocab[token]) for token in predicted]
            print(" ".join(words).capitalize())

            print("> ")
            sys.stdout.flush()
def read_data(source_path, target_path, max_size=None):
    """Load parallel token-id files and distribute the pairs over buckets.

    Args:
        source_path: path to the file with token-ids for the source language.
        target_path: path to the file with token-ids for the target language;
            line n must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read; ``0`` or
            ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(_buckets)``. ``data_set[n]`` is a
        list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < _buckets[n][0]`` and
        ``len(target) < _buckets[n][1]``; each pair lands in the first such
        bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in _buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def decode():
    """Interactively decode sentences typed on stdin.

    Builds the model in a fresh session, loads the source and target
    vocabularies from ``FLAGS.data_dir``, then prompts with ``"> "`` and
    prints the greedy decoding of every line until end-of-file. With
    ``FLAGS.char`` set, a character tokenizer is used and output tokens are
    joined without spaces.

    Raises:
        ValueError: if a sentence has at least as many tokens as the largest
            bucket's encoder size (``min`` of an empty sequence).
    """
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        src_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.source" % FLAGS.src_vocab_size)
        tar_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.target" % FLAGS.tar_vocab_size)
        src_vocab, _ = data_utils.initialize_vocabulary(src_vocab_path)
        _, rev_tar_vocab = data_utils.initialize_vocabulary(tar_vocab_path)

        tokenizer = data_utils.basic_char_tokenizer if FLAGS.char else None

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for sentence in iter(sys.stdin.readline, ""):
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), src_vocab, tokenizer)

            # Smallest bucket whose encoder size exceeds the input length.
            bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                            if bucket[0] > len(token_ids))

            # One-element batch: the sentence paired with an empty target.
            single_batch = {bucket_id: [(token_ids, [])]}
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                single_batch, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)

            # Greedy decoding: keep the argmax of every step's logits.
            outputs = []
            for logit in output_logits:
                outputs.append(int(np.argmax(logit, axis=1)))

            # Drop everything from the first EOS onwards.
            if data_utils.EOS_ID in outputs:
                eos_at = outputs.index(data_utils.EOS_ID)
                outputs = outputs[:eos_at]

            separator = "" if FLAGS.char else " "
            words = [tf.compat.as_str(rev_tar_vocab[token]) for token in outputs]
            print(separator.join(words))

            print("> ", end="")
            sys.stdout.flush()
def decode():
    """Interactively decode sentences typed on stdin.

    Builds the model in a fresh session, obtains the vocabulary paths from
    ``data_utils.prepare_my_data``, then prompts with ``"> "`` and prints the
    greedy decoding of every line (tokens joined by spaces) until
    end-of-file.

    Raises:
        ValueError: if a sentence has at least as many tokens as the largest
            bucket's encoder size (``min`` of an empty sequence).
    """
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies; only the last two returned values are needed.
        prepared = data_utils.prepare_my_data(
            FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
        _, _, _, _, en_vocab_path, fr_vocab_path = prepared
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for sentence in iter(sys.stdin.readline, ""):
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)

            # Smallest bucket whose encoder size exceeds the input length.
            bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                            if bucket[0] > len(token_ids))

            # One-element batch: the sentence paired with an empty target.
            single_batch = {bucket_id: [(token_ids, [])]}
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                single_batch, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)

            # Greedy decoding: keep the argmax of every step's logits.
            outputs = []
            for logit in output_logits:
                outputs.append(int(np.argmax(logit, axis=1)))

            # Drop everything from the first EOS onwards.
            if data_utils.EOS_ID in outputs:
                eos_at = outputs.index(data_utils.EOS_ID)
                outputs = outputs[:eos_at]

            words = [tf.compat.as_str(rev_fr_vocab[token]) for token in outputs]
            print(" ".join(words))

            print("> ", end="")
            sys.stdout.flush()
def read_data(source_path, target_path, max_size=None):
    """Load parallel token-id files and distribute the pairs over buckets.

    A progress line is printed every 1000 line pairs.

    Args:
        source_path: path to the file with token-ids for the source language.
        target_path: path to the file with token-ids for the target language;
            line n must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read; ``0`` or
            ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(_buckets)``. ``data_set[n]`` is a
        list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < _buckets[n][0]`` and
        ``len(target) < _buckets[n][1]``; each pair lands in the first such
        bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in _buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 1000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def decode():
    """Interactively decode sentences typed on stdin.

    Builds the model in a fresh session, loads the encoder and decoder
    vocabularies from ``gConfig['working_directory']``, then prompts with
    ``"> "`` and prints the greedy decoding of every line (tokens joined by
    spaces) until end-of-file.

    Raises:
        ValueError: if a sentence has at least as many tokens as the largest
            bucket's encoder size (``min`` of an empty sequence).
    """
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        working_dir = gConfig['working_directory']
        enc_vocab_path = os.path.join(working_dir,
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(working_dir,
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        # readline() returns "" only at EOF, which ends the loop.
        for sentence in iter(sys.stdin.readline, ""):
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)

            # Smallest bucket whose encoder size exceeds the input length.
            bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                            if bucket[0] > len(token_ids))

            # One-element batch: the sentence paired with an empty target.
            single_batch = {bucket_id: [(token_ids, [])]}
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                single_batch, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)

            # Greedy decoding: keep the argmax of every step's logits.
            outputs = []
            for logit in output_logits:
                outputs.append(int(np.argmax(logit, axis=1)))

            # Drop everything from the first EOS onwards.
            if data_utils.EOS_ID in outputs:
                eos_at = outputs.index(data_utils.EOS_ID)
                outputs = outputs[:eos_at]

            words = [tf.compat.as_str(rev_dec_vocab[token]) for token in outputs]
            print(" ".join(words))

            print("> ", end="")
            sys.stdout.flush()
def read_data(source_path, target_path, max_size=None):
    """Load parallel token-id files and distribute the pairs over buckets.

    Args:
        source_path: path to the file with token-ids for the source language.
        target_path: path to the file with token-ids for the target language;
            line n must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read; ``0`` or
            ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(_buckets)``. ``data_set[n]`` is a
        list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < _buckets[n][0]`` and
        ``len(target) < _buckets[n][1]``; each pair lands in the first such
        bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in _buckets]
    with gfile.GFile(source_path, mode="r") as source_file, \
            gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def read_data(self, source_path, target_path, max_size=None):
    """Load parallel token-id files and distribute the pairs over buckets.

    Bucket sizes are taken from ``self._buckets``.

    Args:
        source_path: path to the file with token-ids for the source language.
        target_path: path to the file with token-ids for the target language;
            line n must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read; ``0`` or
            ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(self._buckets)``. ``data_set[n]`` is
        a list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < self._buckets[n][0]`` and
        ``len(target) < self._buckets[n][1]``; each pair lands in the first
        such bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in self._buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(self._buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def read_data(source_path, target_path, max_size=10000):
    """Load parallel token-id files and distribute the pairs over buckets.

    Args:
        source_path: path to the file with token-ids for the source language.
        target_path: path to the file with token-ids for the target language;
            line n must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read (10000 by
            default); ``0`` or ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(_buckets)``. ``data_set[n]`` is a
        list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < _buckets[n][0]`` and
        ``len(target) < _buckets[n][1]``; each pair lands in the first such
        bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in _buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file, \
            tf.gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def read_data(source_path, target_path, max_size=None):
    """Load parallel token-id files and distribute the pairs over buckets.

    Args:
        source_path: path to the file with token-ids for the source.
        target_path: path to the file with token-ids for the target; line n
            must be the desired output for line n of ``source_path``.
        max_size: upper bound on the number of line pairs read; ``0`` or
            ``None`` reads the files completely.

    Returns:
        data_set: a list of length ``len(_buckets)``. ``data_set[n]`` is a
        list of ``[source, target]`` pairs (lists of token-ids, target with
        EOS appended) such that ``len(source) < _buckets[n][0]`` and
        ``len(target) < _buckets[n][1]``; each pair lands in the first such
        bucket, and pairs that fit nowhere are skipped.
    """
    data_set = [[] for _ in _buckets]
    with gfile.GFile(source_path, mode="r") as source_file, \
            gfile.GFile(target_path, mode="r") as target_file:
        counter = 0
        source, target = source_file.readline(), target_file.readline()
        # Stop at the end of either file.
        while source and target:
            if max_size and counter >= max_size:
                break
            counter += 1
            if counter % 100000 == 0:
                print(" reading data line %d" % counter)
                sys.stdout.flush()

            source_ids = list(map(int, source.split()))
            target_ids = list(map(int, target.split()))
            target_ids.append(data_utils.EOS_ID)

            # First bucket that is strictly larger than both sequences.
            bucket_id = next(
                (idx for idx, (source_size, target_size) in enumerate(_buckets)
                 if len(source_ids) < source_size and len(target_ids) < target_size),
                None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])

            source, target = source_file.readline(), target_file.readline()
    return data_set
def decode():
    """Decode a file sentence-by-sentence.

    Reads sentences from ``FLAGS.decode_input_path`` (one per line), greedily
    decodes each one, and writes the space-joined output tokens to
    ``FLAGS.decode_output_path`` (one line per decoded sentence). Sentences
    that do not fit in any bucket are reported on stdout and skipped, so the
    output file can have fewer lines than the input. The elapsed decoding
    time is printed at the end.
    """
    session_config = tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS)
    with tf.Session(config=session_config) as sess:
        # Create model and load parameters.
        with tf.variable_scope("model"):
            model, steps_done = create_model(sess, True,
                                             attention=FLAGS.attention,
                                             model_path=FLAGS.model_path)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        sents_vocab_path = os.path.join(
            FLAGS.data_dir, "vocab%d.sents" % FLAGS.input_vocab_size)
        parse_vocab_path = os.path.join(
            FLAGS.data_dir, "vocab%d.parse" % FLAGS.output_vocab_size)
        sents_vocab, _ = data_utils.initialize_vocabulary(sents_vocab_path)
        _, rev_parse_vocab = data_utils.initialize_vocabulary(parse_vocab_path)

        start_time = time.time()

        # Decode
        with open(FLAGS.decode_input_path, 'r') as fin, \
                open(FLAGS.decode_output_path, 'w') as fout:
            for line in fin:
                sentence = line.strip()
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence), sents_vocab)

                # Smallest bucket whose encoder size exceeds the input length.
                # min() raises ValueError when no bucket qualifies; catch only
                # that, so unrelated errors and Ctrl-C are not swallowed.
                try:
                    bucket_id = min(idx for idx, bucket in enumerate(_buckets)
                                    if bucket[0] > len(token_ids))
                except ValueError:
                    print("Input sentence does not fit in any buckets. Skipping... ")
                    print("\t", line)
                    continue

                # One-element batch: the sentence paired with an empty target.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)

                # Greedy decoding: argmax of every step, cut at the first EOS.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]

                decoded_sentence = " ".join(
                    tf.compat.as_str(rev_parse_vocab[output]) for output in outputs) + '\n'
                fout.write(decoded_sentence)

        time_elapsed = time.time() - start_time
        print("Decoding time: ", time_elapsed)
def plot_attn_mat(model_dev, all_examples):
    """Decode a batch of examples and return the attention data of the first.

    Relies on the module-level names ``bucket_id``, ``sess``,
    ``rev_parse_vocab`` and ``rev_sent_vocab``. Sets
    ``model_dev.batch_size`` to ``len(all_examples)`` as a side effect.

    Args:
        model_dev: model exposing ``get_decode_batch`` and a ``step`` that
            returns four values, the last two being the output logits and
            the attention weights.
        all_examples: sequence of examples where ``example[0]`` holds the
            sentence token ids and ``example[1]`` the gold output ids.

    Returns:
        A 5-tuple ``(encoder_inputs, sent_text, gold_parse, decoded_parse,
        mat)`` for the first example: the batch encoder inputs, the sentence
        tokens, the gold output tokens, the decoded output tokens (cut at the
        first EOS, ids outside the vocabulary rendered as ``"_UNK"``), and
        ``attns[:, 0, :].T``.
    """
    model_dev.batch_size = len(all_examples)
    token_ids = [example[0] for example in all_examples]
    gold_ids = [example[1] for example in all_examples]
    # One independent empty decoder input per example (no shared list object).
    dec_ids = [[] for _ in token_ids]

    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, which
    # cannot be indexed or re-read by the batching code.
    examples = list(zip(token_ids, dec_ids))
    encoder_inputs, decoder_inputs, target_weights = model_dev.get_decode_batch(
        {bucket_id: examples}, bucket_id)
    _, _, output_logits, attns = model_dev.step(
        sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)

    # Per-step argmax for the whole batch, transposed so each row is one
    # example's output sequence.
    outputs = [np.argmax(logit, axis=1) for logit in output_logits]
    to_decode = np.array(outputs).T

    # Only the first example of the batch is reported.
    sent_id = 0
    parse = list(to_decode[sent_id, :])
    if data_utils.EOS_ID in parse:
        parse = parse[:parse.index(data_utils.EOS_ID)]

    # Ids beyond the vocabulary are rendered as "_UNK".
    vocab_size = len(rev_parse_vocab)
    decoded_parse = [
        tf.compat.as_str(rev_parse_vocab[output]) if output < vocab_size else "_UNK"
        for output in parse
    ]

    gold_parse = [tf.compat.as_str(rev_parse_vocab[output])
                  for output in gold_ids[sent_id]]
    sent_text = [tf.compat.as_str(rev_sent_vocab[output])
                 for output in token_ids[sent_id]]

    # Attention slice at index 0 of the second axis, transposed
    # (presumably the batch axis, i.e. the first example -- confirm).
    mat = attns[:, 0, :].T
    return encoder_inputs, sent_text, gold_parse, decoded_parse, mat
def post(self):
    """Handle a POST request: decode the submitted text and return it as JSON.

    The request body is read from ``request.json`` or, when that is empty,
    from the ``payload`` form field, which must contain a Python/JSON-style
    literal mapping with a ``"text"`` key. The sentence and the decoded
    response are both printed to stdout.

    Returns:
        ``jsonify({"text": response})`` where ``response`` is the greedy
        decoding of the sentence, tokens joined by spaces.
    """
    # Local import keeps this method self-contained.
    import ast

    data_received = request.json
    if not data_received:
        # SECURITY: the payload comes straight from the HTTP request, so it
        # must never be passed to eval(). literal_eval only accepts plain
        # literals (dicts, lists, strings, numbers, ...) and raises
        # ValueError/SyntaxError for anything else.
        data_received = ast.literal_eval(request.form["payload"])
    sentence = data_received["text"]
    print(sentence)

    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)

    # Pick the first bucket that can hold the sentence; fall back to the
    # largest bucket (and warn) when none is big enough.
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)

    # Greedy decoding: argmax of every step, cut at the first EOS.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]

    response = " ".join(tf.compat.as_str(rev_fr_vocab[output]) for output in outputs)
    print(response)
    return jsonify({"text": response})