We extracted the following 50 code examples from open-source Python projects to illustrate how to use data_utils.initialize_vocabulary().
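Most of these projects derive from the TensorFlow sequence-to-sequence ("translate") tutorial, so the function's contract is consistent: it loads a vocabulary file written one token per line and returns a pair (vocab, rev_vocab). The sketch below illustrates that conventional implementation; it is not the exact code of any one project, and details (gfile vs. plain open, bytes vs. str handling) vary between them.

import os

# A minimal sketch of the conventional implementation, adapted from the
# TensorFlow translate tutorial's data_utils; individual projects may differ.
def initialize_vocabulary(vocabulary_path):
  """Load a vocabulary file written one token per line.

  Returns a pair:
    vocab: dict mapping token -> id (its line number in the file),
    rev_vocab: list mapping id -> token (the reverse lookup).
  """
  if not os.path.exists(vocabulary_path):
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
  with open(vocabulary_path, "rb") as f:
    rev_vocab = [line.strip() for line in f]
  vocab = dict((token, idx) for idx, token in enumerate(rev_vocab))
  return vocab, rev_vocab

The forward dict encodes input text into token ids (e.g. via sentence_to_token_ids), while the reversed list maps the model's output ids back to tokens. That is why the examples below typically unpack only one half of each pair: enc_vocab, _ = initialize_vocabulary(...) on the encoder side and _, rev_dec_vocab = initialize_vocabulary(...) on the decoder side.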
Example 1

def init_session(sess, conf='seq2seq.ini'):
  global gConfig
  gConfig = get_config(conf)

  # Create model and load parameters.
  model = create_model(sess, True)
  model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
  dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])

  enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
  _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

  return sess, model, enc_vocab, rev_dec_vocab
Example 2

def init_session(sess, conf='seq2seq.ini'):
  global gConfig
  gConfig = get_config(conf)
  print(gConfig)

  # Create model and load parameters.
  model = create_model(sess, True)
  model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
  dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])

  enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
  _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

  return sess, model, enc_vocab, rev_dec_vocab
Example 3

def main(_):
  print("Loading vocabulary")
  cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt")
  en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt")

  cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path)
  _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

  print("Building model...")
  config = tf.ConfigProto(allow_soft_placement=True)
  with tf.Session(config=config) as sess:
    model = create_model(sess, False)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      seg_list = jieba.lcut(sentence.strip())
      # print(" ".join(seg_list))
      token_ids = [cn_vocab.get(w.encode(encoding="utf-8"), data_utils.UNK_ID) for w in seg_list]
      # print(token_ids)
      outputs = model.test(sess, token_ids)
      outputs = outputs.tolist()
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      output = " ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])
      print(output.capitalize())
      print("> ")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example 4

def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    src_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.source" % FLAGS.src_vocab_size)
    tar_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.target" % FLAGS.tar_vocab_size)
    src_vocab, _ = data_utils.initialize_vocabulary(src_vocab_path)
    _, rev_tar_vocab = data_utils.initialize_vocabulary(tar_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    tokenizer = data_utils.basic_char_tokenizer if FLAGS.char else None
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), src_vocab, tokenizer)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out target-text sentence corresponding to outputs.
      print(("" if FLAGS.char else " ").join([tf.compat.as_str(rev_tar_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example 5

def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    _, _, _, _, en_vocab_path, fr_vocab_path = data_utils.prepare_my_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example 6

def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out the decoded sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example 7

def decode():
  """ Decode file sentence-by-sentence """
  with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS)) as sess:
    # Create model and load parameters.
    with tf.variable_scope("model"):
      model, steps_done = create_model(sess, True, attention=FLAGS.attention,
                                       model_path=FLAGS.model_path)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    sents_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.sents" % FLAGS.input_vocab_size)
    parse_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.parse" % FLAGS.output_vocab_size)
    sents_vocab, _ = data_utils.initialize_vocabulary(sents_vocab_path)
    _, rev_parse_vocab = data_utils.initialize_vocabulary(parse_vocab_path)

    start_time = time.time()
    # Decode
    with open(FLAGS.decode_input_path, 'r') as fin, open(FLAGS.decode_output_path, 'w') as fout:
      for line in fin:
        sentence = line.strip()
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), sents_vocab)
        try:
          bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
        except:
          print("Input sentence does not fit in any buckets. Skipping... ")
          print("\t", line)
          continue
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        decoded_sentence = " ".join([tf.compat.as_str(rev_parse_vocab[output]) for output in outputs]) + '\n'
        fout.write(decoded_sentence)
    time_elapsed = time.time() - start_time
    print("Decoding time: ", time_elapsed)
Example 8

def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Example 9

def prepare_data(config):
  train_path = os.path.join(config.train_dir, "chitchat.train")
  data_path_list = [train_path + ".answer", train_path + ".query"]
  vocab_path = os.path.join(config.train_dir, "vocab%d.all" % config.vocab_size)
  data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
  vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

  # if os.path.isfile(config.dev_set) and os.path.isfile(config.train_set):
  #     dev_set_file = open(config.dev_set, "rb")
  #     dev_set = pickle.load(dev_set_file)
  #     dev_set_file.close()
  #
  #     train_set_file = open(config.train_set, "rb")
  #     train_set = pickle.load(train_set_file)
  #     train_set_file.close()
  # else:

  print("Prepare Chitchat data in %s" % config.train_dir)
  train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
      config.train_dir, vocab, config.vocab_size)

  print("Reading development and training data (limit: %d)." % config.max_train_data_size)
  dev_set = read_data(config, dev_query, dev_answer)
  train_set = read_data(config, train_query, train_answer)

  # dev_set_file = open(config.dev_set, "wb")
  # pickle.dump(dev_set, dev_set_file)
  # dev_set_file.close()
  #
  # train_set_file = open(config.train_set, "wb")
  # pickle.dump(train_set, train_set_file)
  # train_set_file.close()

  return vocab, rev_vocab, dev_set, train_set
def decode(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: model = create_model(sess, True) model.batch_size = 1 enc_vocab_path = os.path.join(working_directory,"vocab%d.enc" % enc_vocab_size) dec_vocab_path = os.path.join(working_directory,"vocab%d.dec" % dec_vocab_size) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: model = create_model(sess, True) model.batch_size = 1 enc_vocab_path = os.path.join(working_directory,"vocab%d.enc" % enc_vocab_size) dec_vocab_path = os.path.join(working_directory,"vocab%d.dec" % dec_vocab_size) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) print('Start chatting...') @bot.message_handler(func=lambda sentence: True) def reply_all(message): sentence = (message.text).lower() token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] message_text = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]) bot.reply_to(message, message_text) while True: try: bot.polling(none_stop=True) except Exception as ex: print(str(ex)) bot.stop_polling() time.sleep(5) bot.polling()
def decode(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: model = create_model(sess, True) model.batch_size = 1 enc_vocab_path = os.path.join(working_directory,"vocab%d.enc" % enc_vocab_size) dec_vocab_path = os.path.join(working_directory,"vocab%d.dec" % dec_vocab_size) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] if sentence[:-1] in lines: temp_output = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]) trigger_check = trigger_activator(temp_output) if trigger_check == True: print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs[:-1]])) else: print(temp_output) else: print('i dont understand you') print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline() #Check if there is a trigger in the decoded sentence
Example 13

def __init__(self):
  self.sess = tf.Session()
  self.download_trained_if_not_exists()

  # Create model and load parameters.
  self.model = create_model(self.sess, True)
  self.model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
  fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
  self.en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, self.rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode sentence and store it with open(gConfig["test_enc"], 'r') as test_enc: with open(gConfig["output"], 'w') as predicted_headline: sentence_count = 0 for sentence in test_enc: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab) # Which bucket does it belong to? And place the sentence to the last bucket if its token length is larger then X. bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write predicted headline corresponding to article. predicted_headline.write(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])+'\n') sentence_count += 1 if sentence_count % 100 == 0: print("predicted data line %d" % sentence_count) sys.stdout.flush() predicted_headline.close() test_enc.close() print("Finished decoding and stored predicted results in %s!" % gConfig["output"])
Example 15

def decode_input():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d_enc.txt" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d_dec.txt" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
      # Which bucket does it belong to? And place the sentence in the last
      # bucket if its token length is larger than the bucket length.
      bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets) - 1])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out the decoded sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_idsgb = data_utils.sentence_to_token_ids(sentence, en_vocab) # Truncate to the maximum bucket size token_ids = token_idsgb[0:119] # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_idsgb = data_utils.sentence_to_token_ids(sentence, en_vocab) # Truncate sentence to the maximum bucket size token_ids = token_idsgb[0:479] # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example 18

def decode():
  # Only allocate part of the gpu memory when predicting.
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)
  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab") fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab") en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example 21

def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    first_vocab_path = os.path.join(FLAGS.train_dir, "vocab%d.first" % FLAGS.first_vocab_size)
    last_vocab_path = os.path.join(FLAGS.train_dir, "vocab%d.last" % FLAGS.last_vocab_size)
    first_vocab, _ = data_utils.initialize_vocabulary(first_vocab_path)
    _, rev_last_vocab = data_utils.initialize_vocabulary(last_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = FLAGS.input
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, first_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
      outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Print out the decoded sentence corresponding to outputs.
    result = (" ".join([rev_last_vocab[output] for output in outputs]))
    print(result)
    output = os.path.join(FLAGS.output_dir, str(int(time.time())) + ".txt")
    with open(output, "w") as text_file:
      text_file.write(result)
    print(output)
    sys.stdout.flush()
Example 22

def decode():
  config_ = tf.ConfigProto()
  config_.gpu_options.allow_growth = True
  config_.allow_soft_placement = True
  with tf.Session(config=config_) as sess:
    # Create model and load parameters.
    model = create_model(sess, True, False)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    from_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size)
    to_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size)
    from_vocab, _ = data_utils.initialize_vocabulary(from_vocab_path)
    _, rev_to_vocab = data_utils.initialize_vocabulary(to_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(sentence, from_vocab)
      print(token_ids)
      # Which bucket does it belong to?
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence)
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out logical form corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_to_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) amr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.amr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(amr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_idsgb = data_utils.sentence_to_token_ids(sentence, en_vocab, normalize_digits=True) # Truncate to the maximum bucket size token_ids = token_idsgb[0:119] # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # print(output_logits) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. # print("Length: %s" % (len(outputs))) print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example 24

def decode():
  ''' Manually input sentence interactively and the headline will be printed out '''
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = FLAGS.batch_size  # repeat single sentence 10 times as one batch
    # We decode one sentence at a time.

    # Load vocabularies.
    vocab_path = os.path.join(FLAGS.data_dir, "vocab")
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    # Decode from standard input interactively
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      if len(sentence.strip('\n')) == 0:
        sys.stdout.flush()
        sentence = sys.stdin.readline()
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
      # print(token_ids)  # print token ids
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(buckets)) if buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      # print("current bucket id" + str(bucket_id))
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits_batch = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      output_logits = []
      for item in output_logits_batch:
        output_logits.append(item[0])
      # print(output_logits)
      # print(len(output_logits))
      # print(output_logits[0])
      outputs = [int(np.argmax(logit)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def evaluate(): with open (os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f: feature = cPickle.load(f) with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f: sentence = cPickle.load(f) with open (os.path.join(FLAGS.data_dir, 'video.length'), 'rb') as f: length = cPickle.load(f) scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] vocab, re_vocab = data_utils.initialize_vocabulary() GTS = {} RES = {} batch_size = 1 max_meteor = 0 with tf.Session() as sess: model = Seq2Seq(FLAGS.num_units, FLAGS.use_lstm, FLAGS.encoder_max_sequence_length, 1, FLAGS.feature_size, FLAGS.vocab_size, FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, FLAGS.max_gradient_norm, forward_only=True) step = 0 while True: step += FLAGS.steps_per_checkpoint ckpt_path = os.path.join(FLAGS.checkpoint_dir,'ckpt-%d'%step) if os.path.isfile(ckpt_path+'.meta'): model.saver.restore(sess, ckpt_path) cg = CaptionGenerator(model=model,start_id=data_utils.GO_ID,end_id=data_utils.EOS_ID, beam_size=3, max_caption_length=FLAGS.decoder_max_sentence_length, length_normalization_factor=0.0) for vid, _ in feature.iteritems(): feature_inputs, feature_lengths, batch_decoder_inputs, batch_weights = model.get_batch(feature, [(vid, [0])], length) sen = cg.beam_search(sess, feature_inputs, feature_lengths) sen = " ".join([tf.compat.as_str(re_vocab[w]) for w in sen]) print ("%s: %s"%(sen, sentence[vid][9])) GTS[vid] = sentence[vid] RES[vid] = [sen] print('STEP: %d'%step) for scorer, method in scorers: score, scores = scorer.compute_score(GTS, RES) if method == "METEOR" and score > max_meteor: max_meteor = score if isinstance(method, list): for k, v in zip(method, score): print("%s:\t%f"%(k, v)) else: print("%s:\t%f"%(method, score)) sys.stdout.flush() else: break print("Max METEOR:\t%f"%max_meteor)
def evaluate(): with open (os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f: feature = cPickle.load(f) with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f: sentence = cPickle.load(f) with open (os.path.join(FLAGS.data_dir, 'video.length'), 'rb') as f: length = cPickle.load(f) scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Meteor(),"METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] vocab, re_vocab = data_utils.initialize_vocabulary() GTS = {} RES = {} batch_size = 1 max_meteor = 0 with tf.Session() as sess: model = Seq2Seq(FLAGS.num_units, FLAGS.use_lstm, FLAGS.encoder_max_sequence_length, FLAGS.decoder_max_sentence_length, FLAGS.feature_size, FLAGS.vocab_size, FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, FLAGS.max_gradient_norm, forward_only=True) step = 0 while True: step += FLAGS.steps_per_checkpoint ckpt_path = os.path.join(FLAGS.checkpoint_dir,'ckpt-%d'%step) if os.path.isfile(ckpt_path+'.meta'): model.saver.restore(sess, ckpt_path) for vid, _ in feature.iteritems(): feature_inputs, feature_lengths, batch_decoder_inputs, batch_weights = model.get_batch(feature, [(vid, [0])], length) output_logits = model.step(sess, feature_inputs, feature_lengths, batch_decoder_inputs, batch_weights, forward_only=True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] sen = " ".join([tf.compat.as_str(re_vocab[output]) for output in outputs]) print ("%s: %s"%(sen, sentence[vid][9])) GTS[vid] = sentence[vid] RES[vid] = [sen] print('STEP: %d'%step) for scorer, method in scorers: score, scores = scorer.compute_score(GTS, RES) if method == "METEOR" and score > max_meteor: max_meteor = score if isinstance(method, list): for k, v in zip(method, score): print("%s:\t%f"%(k, v)) else: print("%s:\t%f"%(method, score)) sys.stdout.flush() else: break print("Max METEOR:\t%f"%max_meteor)