def train(epoch):
    """Run one pass over ``train_loader`` and then save the model weights.

    Uses the module-level ``model``, ``train_loader``, ``optimizer`` and
    ``args`` objects.

    Args:
        epoch: epoch number, used in the progress message and in the name of
            the weights file written at the end.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        if args.cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        inputs, labels = Variable(inputs), Variable(labels)

        optimizer.zero_grad()
        output = model(inputs)
        loss = F.nll_loss(output, labels)
        loss.backward()
        optimizer.step()

        # Only report every ``args.log_interval`` batches.
        if step % args.log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.data[0]))

    # 1. Save the model every epoch
    torch.save(model.state_dict(), "mnist_model_{0:03d}.pwf".format(epoch))
def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL):
    """Saves an object to a disk file.

    Args:
        obj: saved object
        f: a file-like object (has to implement fileno that returns a file
            descriptor) or a string containing a file name
        pickle_module: module used for pickling metadata and objects
        pickle_protocol: can be specified to override the default protocol

    Returns:
        Whatever ``_save`` returns.
    """
    import sys

    # On Python 2 a file name may also arrive as ``unicode``; treat it as a
    # path as well instead of mistaking it for a file object.
    is_path = isinstance(f, str) or \
        (sys.version_info[0] == 2 and isinstance(f, unicode))  # noqa: F821
    if not is_path:
        return _save(obj, f, pickle_module, pickle_protocol)

    # We opened the file ourselves, so we are responsible for closing it.
    with open(f, "wb") as opened:
        return _save(obj, opened, pickle_module, pickle_protocol)
def test_serialization(self):
    """Round-trip CUDA tensors and a storage through torch.save/torch.load.

    Checks that values, types and the sharing between list entries survive
    serialization.
    """
    x = torch.randn(5, 5).cuda()
    y = torch.IntTensor(2, 5).fill_(0).cuda()
    q = [x, y, x, y.storage()]
    with tempfile.NamedTemporaryFile() as f:
        torch.save(q, f)
        f.seek(0)
        q_copy = torch.load(f)
    self.assertEqual(q_copy, q, 0)
    # Entries 0 and 2 were the same tensor before saving, so a write through
    # one must be visible through the other after loading.
    q_copy[0].fill_(5)
    self.assertEqual(q_copy[0], q_copy[2], 0)
    self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
    self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
    q_copy[1].fill_(10)
    # Compare for equality: assertTrue(a, b) would treat ``b`` as the failure
    # message and never compare the two storages.
    self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10), 0)
def __call__(self, test_case):
    """Build the module under test and run the checks of this test spec.

    Args:
        test_case: the test case object providing ``_forward``,
            ``_get_parameters`` and ``assertEqual``.
    """
    module = self.constructor(*self.constructor_args)
    module_input = self._get_input()

    if self.reference_fn is not None:
        # Compare the module output against the reference implementation.
        actual = test_case._forward(module, module_input)
        if isinstance(actual, Variable):
            actual = actual.data
        reference_input = self._unpack_input(deepcopy(module_input))
        reference_params = test_case._get_parameters(module)[0]
        expected = self.reference_fn(reference_input, reference_params)
        test_case.assertEqual(actual, expected)

    # TODO: do this with in-memory files as soon as torch.save will support it
    with TemporaryFile() as handle:
        test_case._forward(module, module_input)
        torch.save(module, handle)
        handle.seek(0)
        restored = torch.load(handle)
        # A module restored from disk must produce the same output.
        test_case.assertEqual(test_case._forward(module, module_input),
                              test_case._forward(restored, module_input))

    self._do_test(test_case, module, module_input)
def test_serialization(self):
    """Round-trip tensors and storages through torch.save/torch.load.

    The payload is saved once through a file object and once through a file
    name; values, types and shared memory must survive both ways.
    """
    tensors = [torch.randn(5, 5).float() for _ in range(2)]
    payload = [tensors[k % 2] for k in range(4)]
    payload += [tensors[0].storage()]
    payload += [tensors[0].storage()[1:4]]
    for use_name in (False, True):
        with tempfile.NamedTemporaryFile() as f:
            target = f.name if use_name else f
            torch.save(payload, target)
            f.seek(0)
            loaded = torch.load(target)
        self.assertEqual(payload, loaded, 0)
        for position in range(4):
            self.assertTrue(isinstance(loaded[position], torch.FloatTensor))
        self.assertTrue(isinstance(loaded[4], torch.FloatStorage))
        # Entries 0/2 and 1/3 were the same tensors before saving; writes
        # through one must show up in the other and in the shared storage.
        loaded[0].fill_(10)
        self.assertEqual(loaded[0], loaded[2], 0)
        self.assertEqual(loaded[4], torch.FloatStorage(25).fill_(10), 0)
        loaded[1].fill_(20)
        self.assertEqual(loaded[1], loaded[3], 0)
        self.assertEqual(loaded[4], loaded[5][1:4], 0)
def save_checkpoint(self, state, is_best):
    """Write ``state`` to the checkpoint directory.

    The checkpoint is stored as ``<model name>_ckpt.pth.tar`` inside
    ``self.ckpt_dir``. When ``is_best`` is true it is additionally copied to
    ``<model name>_model_best.pth.tar``.

    Args:
        state: object handed to ``torch.save``.
        is_best: whether this checkpoint should also be kept as the best one.
    """
    print("[*] Saving model to {}".format(self.ckpt_dir))

    ckpt_path = os.path.join(self.ckpt_dir,
                             self.get_model_name() + '_ckpt.pth.tar')
    torch.save(state, ckpt_path)

    if not is_best:
        return
    best_path = os.path.join(self.ckpt_dir,
                             self.get_model_name() + '_model_best.pth.tar')
    shutil.copyfile(ckpt_path, best_path)
    print("[*] ==== Best Valid Acc Achieved ====")
def save(self):
    """Index the source/target corpora and dump everything to ``self._save_data``.

    The saved dict holds the maximum word length, both vocabularies with
    their sizes, and the index-encoded training and validation corpora.
    """
    src_vocab = self.src_dict.word2idx
    tgt_vocab = self.tgt_dict.word2idx

    data = {'max_word_len': self._max_len}
    data['dict'] = {
        'src': src_vocab,
        'src_size': len(self.src_dict),
        'tgt': tgt_vocab,
        'tgt_size': len(self.tgt_dict),
    }
    data['train'] = {
        'src': corpora2idx(self.src_train, src_vocab),
        'tgt': corpora2idx(self.tgt_train, tgt_vocab),
    }
    data['valid'] = {
        'src': corpora2idx(self.src_valid, src_vocab),
        'tgt': corpora2idx(self.tgt_valid, tgt_vocab),
    }

    torch.save(data, self._save_data)
    print('src corpora length - [{}] | target corpora length - [{}]'.format(
        len(self.src_dict), len(self.tgt_dict)))
def save(self):
    """Index the sentence/label corpora and dump everything to ``self._save_data``.

    The saved dict holds the training score, maximum length, tag count, both
    dictionaries and the index-encoded training and validation data.
    """
    word_index = self.sent_dict.word2idx

    data = {
        'trains_score': self.trains_score(),
        'max_len': self.max_len,
        'tag_size': len(self.tgt_dict),
    }
    data['dict'] = {
        'src': word_index,
        'vocab_size': len(self.sent_dict),
        'tgt': self.tgt_dict,
    }
    data['train'] = {
        'src': corpora2idx(self.src_sents, word_index),
        'label': corpora2idx(self.labels, self.tgt_dict),
    }
    data['valid'] = {
        'src': corpora2idx(self.valid_src_sents, word_index),
        'label': corpora2idx(self.valid_labels, self.tgt_dict),
    }

    torch.save(data, self._save_data)
    print('Finish dumping the corora data to file - [{}]'.format(self._save_data))
    print('words length - [{}]'.format(len(self.sent_dict)))
def save_now(self):
    """Decide whether a checkpoint should be written right now.

    Precedence: an external trigger (which is reset here), then the
    best-validation-score rule, then the ``_save_every`` schedule evaluated
    by epoch or by iteration.

    Returns:
        A truthy value when a save is due, a falsy value otherwise.
    """
    if self._save_externally_triggered:
        # Reset trigger
        self._save_externally_triggered = False
        return True

    if self._is_iteration_with_best_validation_score:
        return self._save_at_best_validation_score

    schedule = self._save_every
    if schedule is None:
        # No periodic schedule configured.
        return False

    if not schedule.by_epoch:
        # We're saving by iterations
        return schedule.match(iteration_count=self._iteration_count)

    # Don't save if we've already saved once this epoch
    if self._epoch_count == self._last_saved_at_epoch:
        return False
    return schedule.match(epoch_count=self._epoch_count)
def savemodel(self, epoch=None, model=None, opts=None, best_flag=False):
    """Save a resumable checkpoint and, optionally, the best model.

    Args:
        epoch: epoch number stored under ``'resume_epoch'``.
        model: model object stored under ``'model'``.
        opts: optimizer object(s) stored under ``'opts'``.
        best_flag: when true, also write ``best_model.pkl``.
    """
    # Note: if we add hook to the grad by using register_hook(hook), then the hook function can not be saved
    # so we need to save state_dict() only. Although save state dictionary is recommended, I still want to save
    # the whole model as it can save the structure of network too, thus we do not need to create a new network
    # next time.
    # model = utils.list2sequential(model).state_dict()
    # opts = opts.state_dict()

    if not os.path.isdir(self.save_path):
        # makedirs (not mkdir) so a missing parent directory is created too.
        os.makedirs(self.save_path)

    # self.check_point_params['model'] = utils.list2sequential(model).state_dict()
    self.check_point_params['model'] = model
    self.check_point_params['opts'] = opts
    self.check_point_params['resume_epoch'] = epoch

    # File names are appended directly, so ``save_path`` is expected to end
    # with a path separator.
    torch.save(self.check_point_params, self.save_path + "checkpoint.pkl")
    if best_flag:
        # best_model = {'model': utils.list2sequential(model).state_dict()}
        best_model = {'model': model}
        torch.save(best_model, self.save_path + "best_model.pkl")
def main():
    """Build the vocabularies and datasets and save them to one ``.pt`` file.

    All paths and vocabulary sizes come from the module-level ``opt``.
    """
    dicts = {
        "src": initVocabulary("source", opt.train_src, opt.src_vocab_size,
                              opt.save_data + ".src.dict"),
    }
    dicts["tgt"] = initVocabulary("target", opt.train_tgt, opt.tgt_vocab_size,
                                  opt.save_data + ".tgt.dict")

    save_data = {"dicts": dicts}
    # (key in the saved dict, source file, target file), built in this order.
    splits = (
        ("train_xe", opt.train_xe_src, opt.train_xe_tgt),
        ("train_pg", opt.train_pg_src, opt.train_pg_tgt),
        ("valid", opt.valid_src, opt.valid_tgt),
        ("test", opt.test_src, opt.test_tgt),
    )
    for split_name, src_file, tgt_file in splits:
        save_data[split_name] = makeDataGeneral(split_name, src_file, tgt_file, dicts)

    print("Saving data to \"" + opt.save_data + "-train.pt\"...")
    torch.save(save_data, opt.save_data + "-train.pt")
def train_dice(args, epoch, model, trainLoader, optimizer, trainF, weights):
    """Train ``model`` for one epoch with the dice loss, logging every batch.

    Args:
        args: options object; only ``args.cuda`` is read here.
        epoch: current epoch number, used to compute the fractional epoch.
        model: network to train.
        trainLoader: iterable of ``(data, target)`` batches with a ``dataset``.
        optimizer: optimizer stepping the model parameters.
        trainF: open text file receiving one ``epoch,loss,err`` line per batch.
        weights: not used in this function.
    """
    model.train()
    seen = 0
    total = len(trainLoader.dataset)
    n_batches = len(trainLoader)
    for batch_idx, (data, target) in enumerate(trainLoader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)

        optimizer.zero_grad()
        output = model(data)
        loss = bioloss.dice_loss(output, target)
        # make_graph.save('/tmp/t.dot', loss.creator); assert(False)
        loss.backward()
        optimizer.step()

        seen += len(data)
        loss_value = loss.data[0]
        err = 100.*(1. - loss_value)
        partialEpoch = epoch + batch_idx / n_batches - 1
        print('Train Epoch: {:.2f} [{}/{} ({:.0f}%)]\tLoss: {:.8f}\tError: {:.8f}'.format(
            partialEpoch, seen, total, 100. * batch_idx / n_batches,
            loss_value, err))

        trainF.write('{},{},{}\n'.format(partialEpoch, loss_value, err))
        trainF.flush()
def train(args, epoch, net, trainLoader, optimizer, trainF):
    """Train ``net`` for one epoch with NLL loss, logging every batch.

    Args:
        args: options object; only ``args.cuda`` is read here.
        epoch: current epoch number, used to compute the fractional epoch.
        net: network to train.
        trainLoader: iterable of ``(data, target)`` batches with a ``dataset``.
        optimizer: optimizer stepping the network parameters.
        trainF: open text file receiving one ``epoch,loss,err`` line per batch.
    """
    net.train()
    seen = 0
    total = len(trainLoader.dataset)
    n_batches = len(trainLoader)
    for batch_idx, (data, target) in enumerate(trainLoader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)

        optimizer.zero_grad()
        output = net(data)
        loss = F.nll_loss(output, target)
        # make_graph.save('/tmp/t.dot', loss.creator); assert(False)
        loss.backward()
        optimizer.step()

        seen += len(data)
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        incorrect = pred.ne(target.data).cpu().sum()
        err = 100.*incorrect/len(data)
        loss_value = loss.data[0]
        partialEpoch = epoch + batch_idx / n_batches - 1
        print('Train Epoch: {:.2f} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tError: {:.6f}'.format(
            partialEpoch, seen, total, 100. * batch_idx / n_batches,
            loss_value, err))

        trainF.write('{},{},{}\n'.format(partialEpoch, loss_value, err))
        trainF.flush()
def save_checkpoint(self, save_path):
    """Save neural network and trainer states to disk.

    The wrapped module is moved to the CPU for saving and moved back to the
    first configured device afterwards, even when saving fails.

    Args:
        save_path: destination file; its parent directory is created when it
            does not exist yet.
    """
    # self.net should be an instance of torch.nn.DataParallel
    module = self.net.module
    module.cpu()
    try:
        training_state_dict = {
            'nn': module,
            'optimizer': self.optimizer,
            'meta_dict': self.meta
        }
        print('saving checkpoint to:', save_path)
        dirname = os.path.dirname(save_path)
        # ``dirname`` is empty for a bare file name; os.makedirs('') would raise.
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        torch.save(training_state_dict, save_path)
    finally:
        # Always put the module back on the GPU so training can continue.
        module.cuda(self._device_ids[0])
def save_checkpoint(self, state, is_best, filename='checkpoint.pth.tar'):
    """Save ``state`` and keep copies of the best checkpoint.

    When the ``'save_model_path'`` state entry is set, files go into that
    directory (created on demand). For a best checkpoint, ``model_best.pth.tar``
    is refreshed; with a save directory, a score-stamped copy is written too
    and the previous score-stamped copy is removed.

    Args:
        state: object handed to ``torch.save``; ``state['best_score']`` is
            read when a score-stamped copy is written.
        is_best: whether this checkpoint is the best one so far.
        filename: checkpoint file name.
    """
    use_model_dir = self._state('save_model_path') is not None

    if use_model_dir:
        model_dir = self.state['save_model_path']
        filename = os.path.join(model_dir, filename)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

    print('save model {filename}'.format(filename=filename))
    torch.save(state, filename)

    if not is_best:
        return

    filename_best = 'model_best.pth.tar'
    if use_model_dir:
        filename_best = os.path.join(self.state['save_model_path'], filename_best)
    shutil.copyfile(filename, filename_best)

    if not use_model_dir:
        return

    # Replace the previous score-stamped copy with the new one.
    previous_best = self._state('filename_previous_best')
    if previous_best is not None:
        os.remove(previous_best)
    stamped_name = 'model_best_{score:.4f}.pth.tar'.format(score=state['best_score'])
    filename_best = os.path.join(self.state['save_model_path'], stamped_name)
    shutil.copyfile(filename, filename_best)
    self.state['filename_previous_best'] = filename_best
def epoch(self, epoch_index):
    """Save the latest weights and, on a new best validation loss, the best ones.

    Args:
        epoch_index: epoch number inserted into the checkpoint file names.
    """
    def checkpoint_file(pattern):
        # Full path of the checkpoint for this epoch/iteration.
        return os.path.join(
            self.checkpoints_path,
            pattern.format(epoch_index, self.trainer.iterations)
        )

    if not self.keep_old_checkpoints:
        self._clear(self.last_pattern.format('*', '*'))
    torch.save(self.trainer.model.state_dict(),
               checkpoint_file(self.last_pattern))

    latest_loss = self.trainer.stats['validation_loss']['last']
    if latest_loss < self._best_val_loss:
        # New best: drop the old best checkpoint(s) and store this one.
        self._clear(self.best_pattern.format('*', '*'))
        torch.save(self.trainer.model.state_dict(),
                   checkpoint_file(self.best_pattern))
        self._best_val_loss = latest_loss
def save_checkpoint(exp_name, epoch, model_state_dict, optimizer_state_dict):
    """Save the trained model as a checkpoint.

    The file is written to ``<const.OUTPUT_DIR>/<exp_name>/<epoch>.pth.tar``.

    Args:
        exp_name: experiment name; also the sub-directory of the output dir.
        epoch: epoch number; also the stem of the file name.
        model_state_dict: stored under ``'state_dict'``.
        optimizer_state_dict: stored under ``'optimizer'``.
    """
    checkpoint = dict(
        exp_name=exp_name,
        epoch=epoch,
        state_dict=model_state_dict,
        optimizer=optimizer_state_dict,
    )
    target = os.path.join(const.OUTPUT_DIR, exp_name, str(epoch) + '.pth.tar')
    torch.save(checkpoint, target)
def save(obj, f, pickle_module=pickle, pickle_protocol=DEFAULT_PROTOCOL):
    """Saves an object to a disk file.

    See also: :ref:`recommend-saving-models`

    Args:
        obj: saved object
        f: a file-like object (has to implement fileno that returns a file
            descriptor) or a string containing a file name
        pickle_module: module used for pickling metadata and objects
        pickle_protocol: can be specified to override the default protocol
    """
    given_a_name = isinstance(f, str) or \
        (sys.version_info[0] == 2 and isinstance(f, unicode))
    if not given_a_name:
        # Caller owns the file object; leave it open.
        return _save(obj, f, pickle_module, pickle_protocol)

    # We open the file ourselves, so we also close it.
    with open(f, "wb") as opened:
        return _save(obj, opened, pickle_module, pickle_protocol)
def test_serialization_array_with_storage(self):
    """Round-trip a list of CUDA tensors plus a storage through save/load.

    Checks that values, types and the sharing between list entries survive
    serialization.
    """
    x = torch.randn(5, 5).cuda()
    y = torch.IntTensor(2, 5).fill_(0).cuda()
    q = [x, y, x, y.storage()]
    with tempfile.NamedTemporaryFile() as f:
        torch.save(q, f)
        f.seek(0)
        q_copy = torch.load(f)
    self.assertEqual(q_copy, q, 0)
    # Entries 0 and 2 were the same tensor before saving, so a write through
    # one must be visible through the other after loading.
    q_copy[0].fill_(5)
    self.assertEqual(q_copy[0], q_copy[2], 0)
    self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
    self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
    q_copy[1].fill_(10)
    # Compare for equality: assertTrue(a, b) would treat ``b`` as the failure
    # message and never compare the two storages.
    self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10), 0)
def test_multigpu_serialization_remap(self):
    """Load tensors saved from two GPUs, remapping device 1 onto device 0."""
    tensors = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]

    def gpu_remap(storage, location):
        # Returning None for other locations leaves them to the default handling.
        if location == 'cuda:1':
            return storage.cuda(0)

    with tempfile.NamedTemporaryFile() as handle:
        torch.save(tensors, handle)
        handle.seek(0)
        restored = torch.load(handle, map_location=gpu_remap)

    for original, copy in zip(tensors, restored):
        self.assertEqual(copy, original)
        self.assertIs(type(copy), type(original))
        self.assertEqual(copy.get_device(), 0)
def __call__(self, test_case):
    """Build the module under test and run the checks of this test spec.

    Args:
        test_case: the test case object providing ``_forward``,
            ``_get_parameters`` and ``assertEqual``.
    """
    module = self.constructor(*self.constructor_args)
    module_input = self._get_input()

    if self.reference_fn is not None:
        # Compare the module output against the reference implementation.
        actual = test_case._forward(module, module_input)
        if isinstance(actual, Variable):
            actual = actual.data
        reference_input = self._unpack_input(deepcopy(module_input))
        reference_params = test_case._get_parameters(module)[0]
        expected = self.reference_fn(reference_input, reference_params)
        test_case.assertEqual(actual, expected)

    self.test_noncontig(test_case, module, module_input)

    # TODO: do this with in-memory files as soon as torch.save will support it
    with TemporaryFile() as handle:
        test_case._forward(module, module_input)
        torch.save(module, handle)
        handle.seek(0)
        restored = torch.load(handle)
        # A module restored from disk must produce the same output.
        test_case.assertEqual(test_case._forward(module, module_input),
                              test_case._forward(restored, module_input))

    self._do_test(test_case, module, module_input)
def main(**kwargs):
    """Run the configured model over the test data and save the scores.

    Options are taken from the module-level ``opt`` after applying ``kwargs``.
    The result tensor is written to ``opt.result_path``.

    Args:
        **kwargs: option overrides forwarded to ``opt.parse``.
    """
    opt.parse(kwargs)

    model = getattr(models, opt.model)(opt).cuda().eval()
    if opt.model_path is not None:
        model.load(opt.model_path)
    # NOTE(review): options are parsed a second time, presumably because
    # loading the model may overwrite them - confirm.
    opt.parse(kwargs)

    model = model.eval()

    test_data_title, test_data_content, index2qid, labels = load_data(type_=opt.type_)
    Num = len(test_data_title)
    print("Num:  {}".format(Num))
    result = np.zeros((Num, 1999))

    # Walk the data in batch-sized windows. Slicing clamps at the end, so the
    # final window covers the remainder, and the last full batch is handled
    # even when Num is an exact multiple of the batch size.
    batch_size = opt.batch_size
    for start in tqdm.tqdm(range(0, Num, batch_size)):
        stop = start + batch_size
        # import ipdb;ipdb.set_trace()
        title = np.array(test_data_title[start:stop])
        content = np.array(test_data_content[start:stop])
        result[start:stop, :] = dotest(model, title, content)

    t.save(t.from_numpy(result).float(), opt.result_path)
def get_optimizer(self, lr1, lr2=0, weight_decay=0):
    """Build an Adam optimizer with a separate learning rate for the encoder.

    Args:
        lr1: learning rate for every parameter outside ``self.encoder``.
        lr2: learning rate for the encoder parameters; ``None`` means half
            of ``lr1``.
        weight_decay: weight decay applied to the non-encoder group.

    Returns:
        The configured ``Adam`` optimizer with two parameter groups.
    """
    encoder_ids = set(map(id, self.encoder.parameters()))
    base_params = (p for p in self.parameters() if id(p) not in encoder_ids)
    if lr2 is None:
        lr2 = lr1*0.5
    param_groups = [
        {'params': base_params, 'weight_decay': weight_decay, 'lr': lr1},
        {'params': self.encoder.parameters(), 'lr': lr2},
    ]
    return t.optim.Adam(param_groups)

# def save2(self,name=None):
#     prefix = 'checkpoints/' + self.model_name + '_'
#     if name is None:
#         name = time.strftime('%m%d_%H:%M:%S.pth')
#     path = prefix+name
#     data = {'opt':self.opt.state_dict(),'d':self.state_dict()}
#     t.save(data, path)
#     return path

# # def load2(self,path):
# #     data = t.load(path)
# #     self.__init__(data['opt'])
# #     self.load_state_dict(data['d'])
def test_one(file, data_):
    """Score the test data with one multi-input model and return the result.

    Args:
        file: model path, stored into ``opt.model_path`` before the model is
            constructed.
        data_: tuple ``(titles, contents, index2qid, labels)``; titles and
            contents are each indexed with ``[0]`` and ``[1]`` and sliced in
            parallel.

    Returns:
        A float tensor holding one row of scores per test sample.
    """
    test_data_title, test_data_content, index2qid, labels = data_
    opt.model_path = file
    model = models.MultiModelAll4zhihu(opt).cuda().eval()

    Num = len(test_data_title[0])
    result = np.zeros((Num, 1999))

    # Walk the data in batch-sized windows. Slicing clamps at the end, so the
    # final window covers the remainder, and the last full batch is handled
    # even when Num is an exact multiple of the batch size.
    batch_size = opt.batch_size
    for start in tqdm.tqdm(range(0, Num, batch_size)):
        stop = start + batch_size
        title = (np.array(test_data_title[0][start:stop]),
                 np.array(test_data_title[1][start:stop]))
        content = (np.array(test_data_content[0][start:stop]),
                   np.array(test_data_content[1][start:stop]))
        result[start:stop, :] = dotest(model, title, content)

    # t.save(t.from_numpy(result).float(),opt.result_path)
    return t.from_numpy(result).float()
def __init__(self):
    """Create the thirteen 3x3 convolutions as individually named attributes.

    Layers are named ``conv<block>_<index>`` and all parameters are frozen.
    """
    super(VGG16, self).__init__()

    # We need to save those and do the forward step manually to be able to
    # keep every in-between steps.
    # Each entry: (block number, input channels, output channels, conv count).
    layout = (
        (1, 3, 64, 2),
        (2, 64, 128, 2),
        (3, 128, 256, 3),
        (4, 256, 512, 3),
        (5, 512, 512, 3),
    )
    for block, in_channels, out_channels, n_convs in layout:
        for position in range(1, n_convs + 1):
            # Only the first conv of a block changes the channel count.
            channels_in = in_channels if position == 1 else out_channels
            layer = nn.Conv2d(channels_in, out_channels,
                              kernel_size=3, stride=1, padding=1)
            setattr(self, 'conv{}_{}'.format(block, position), layer)

    # Remove gradients since we won't need them
    for weight in self.parameters():
        weight.requires_grad = False
def torch_to_pytorch(model, t7_file, output):
    """Copy parameters from a Torch7 file into ``model`` and save its state dict.

    Layers from both sides are flattened and walked in lock-step; a type
    mismatch between the two sequences raises an error.

    Args:
        model: PyTorch model receiving the parameters.
        t7_file: path of the Torch7 file to read.
        output: destination passed to ``torch.save``.

    Raises:
        RuntimeError: if a PyTorch layer does not match the corresponding
            Torch7 layer according to ``layer_map``.
    """
    py_layers = []
    for child in list(model.children()):
        py_layer_serial(child, py_layers)

    t7_layers = []
    for entry in torchfile.load(t7_file):
        torch_layer_serial(entry, t7_layers)

    cursor = 0  # position in the flattened Torch7 layer list
    for py_layer in py_layers:
        py_name = type(py_layer).__name__
        t7_layer = t7_layers[cursor]
        t7_name = t7_layer[0].split('.')[-1]
        if layer_map[t7_name] != py_name:
            raise RuntimeError('%s does not match %s' % (py_name, t7_name))

        if py_name != 'LSTM':
            cursor += 1
        else:
            # An LSTM consumes one Torch7 entry per layer and direction.
            directions = 2 if py_layer.bidirectional else 1
            span = directions * py_layer.num_layers
            t7_layer = t7_layers[cursor:cursor + span]
            cursor += span
        load_params(py_layer, t7_layer)

    torch.save(model.state_dict(), output)
def test(epoch):
    """Evaluate ``net`` on ``testloader`` and track the best accuracy.

    Args:
        epoch: epoch number recorded in the checkpoint state.

    Returns:
        The accuracy on the test set, in percent.
    """
    global best_acc
    net.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for batch_idx, (inputs, targets) in enumerate(testloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        loss_sum += loss.data[0]
        _, predicted = torch.max(outputs.data, 1)
        n_seen += targets.size(0)
        n_correct += predicted.eq(targets.data).cpu().sum()

        progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (loss_sum/(batch_idx+1), 100.*n_correct/n_seen, n_correct, n_seen))

    # Save checkpoint.
    acc = 100.*n_correct/n_seen
    if acc > best_acc:
        print('Saving..')
        # The state is assembled but writing it to disk is currently disabled.
        state = {
            'net': net.module if use_cuda else net,
            'acc': acc,
            'epoch': epoch,
        }
        # if not os.path.isdir('checkpoint'):
        #     os.mkdir('checkpoint')
        # torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
    return acc
def train(args):
    """Train a segmentation model and save it after every epoch.

    Args:
        args: options object providing ``dataset``, ``img_rows``,
            ``img_cols``, ``batch_size``, ``visdom``, ``arch``, ``l_rate``,
            ``n_epoch`` and ``feature_scale``.
    """
    # Setup Dataloader
    loader_cls = get_loader(args.dataset)
    dataset_root = get_data_path(args.dataset)
    loader = loader_cls(dataset_root, is_transform=True,
                        img_size=(args.img_rows, args.img_cols))
    n_classes = loader.n_classes
    trainloader = data.DataLoader(loader, batch_size=args.batch_size,
                                  num_workers=4, shuffle=True)

    # Setup visdom for visualization
    if args.visdom:
        vis = visdom.Visdom()
        loss_window = vis.line(X=torch.zeros((1,)).cpu(),
                               Y=torch.zeros((1)).cpu(),
                               opts=dict(xlabel='minibatches',
                                         ylabel='Loss',
                                         title='Training Loss',
                                         legend=['Loss']))

    # Setup Model
    model = get_model(args.arch, n_classes)
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    model.cuda()

    optimizer = torch.optim.SGD(model.parameters(), lr=args.l_rate,
                                momentum=0.99, weight_decay=5e-4)

    for epoch in range(args.n_epoch):
        for i, (images, labels) in enumerate(trainloader):
            images = Variable(images.cuda())
            labels = Variable(labels.cuda())

            optimizer.zero_grad()
            loss = cross_entropy2d(model(images), labels)
            loss.backward()
            optimizer.step()

            if args.visdom:
                vis.line(
                    X=torch.ones((1, 1)).cpu() * i,
                    Y=torch.Tensor([loss.data[0]]).unsqueeze(0).cpu(),
                    win=loss_window,
                    update='append')

            if (i+1) % 20 == 0:
                print("Epoch [%d/%d] Loss: %.4f" % (epoch+1, args.n_epoch, loss.data[0]))

        # One whole-model snapshot per epoch.
        snapshot_name = "{}_{}_{}_{}.pkl".format(args.arch, args.dataset,
                                                 args.feature_scale, epoch)
        torch.save(model, snapshot_name)
def save_checkpoint(state, is_best, save_path, filename):
    """Save ``state`` under ``save_path`` and optionally mark it as best.

    Args:
        state: object handed to ``torch.save``.
        is_best: when true, the file is also copied to ``model_best.pth.tar``
            in the same directory.
        save_path: directory receiving the files.
        filename: name of the checkpoint file inside ``save_path``.
    """
    checkpoint_file = os.path.join(save_path, filename)
    torch.save(state, checkpoint_file)
    if not is_best:
        return
    shutil.copyfile(checkpoint_file, os.path.join(save_path, 'model_best.pth.tar'))
def save_checkpoint(state, is_best, filename, bestname):
    """Save ``state`` to ``filename`` and optionally copy it to ``bestname``.

    Args:
        state: object handed to ``torch.save``.
        is_best: when true, ``filename`` is also copied to ``bestname``.
        filename: path of the checkpoint file.
        bestname: path of the best-checkpoint copy.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, bestname)
def train_epochs(model, loss_fn, init_lr, model_dir):
    """Train for a fixed number of epochs, saving the weights after each one.

    ``model_dir`` is wiped and recreated first. From the second epoch on the
    learning rate is scaled down before every epoch.

    Args:
        model: network to train.
        loss_fn: loss function forwarded to ``train_one_epoch``.
        init_lr: initial learning rate for Adam.
        model_dir: directory receiving one ``.pkl`` file per epoch.
    """
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    optimizer = optim.Adam(model.parameters(), lr = init_lr) # setup the optimizer
    learning_rate = init_lr
    max_iter = 5
    start_halfing_iter = 2
    halfing_factor = 0.1

    for count in range(1, max_iter + 1):
        print ("Starting epoch", count)

        if count >= start_halfing_iter:
            learning_rate *= halfing_factor
            adjust_learning_rate(optimizer, halfing_factor) # decay learning rate

        model_path = model_dir + '/epoch' + str(count) + '_lr' + str(learning_rate) + '.pkl'
        train_one_epoch(model, loss_fn, optimizer) # train one epoch
        torch.save(model.state_dict(), model_path)

    print ("End training")
def save_checkpoint(state, save_path, is_best=False, max_keep=None):
    """Save a checkpoint and maintain a rolling list of recent checkpoints.

    The names of saved checkpoints are recorded, newest first, in a
    ``latest_checkpoint`` file next to them.

    Args:
        state: object handed to ``torch.save``.
        save_path: path of the checkpoint file to write.
        is_best: when true, the checkpoint is also copied to
            ``best_model.ckpt`` in the same directory.
        max_keep: when not ``None``, only this many newest checkpoints are
            kept; older files are deleted.
    """
    # save checkpoint
    torch.save(state, save_path)

    # deal with max_keep
    save_dir = os.path.dirname(save_path)
    list_path = os.path.join(save_dir, 'latest_checkpoint')
    ckpt_name = os.path.basename(save_path)

    previous = []
    if os.path.exists(list_path):
        with open(list_path) as f:
            previous = f.readlines()
    # Drop stale entries for the same name so that re-saving a checkpoint
    # cannot push the freshly written file into the deletion window.
    previous = [line for line in previous if line.rstrip('\n') != ckpt_name]
    ckpt_list = [ckpt_name + '\n'] + previous

    if max_keep is not None:
        for entry in ckpt_list[max_keep:]:
            # rstrip instead of [:-1]: the last line may lack a newline.
            stale = os.path.join(save_dir, entry.rstrip('\n'))
            if os.path.exists(stale):
                os.remove(stale)
        ckpt_list[max_keep:] = []

    with open(list_path, 'w') as f:
        f.writelines(ckpt_list)

    # copy best
    if is_best:
        # Use the full path: the bare file name would be resolved against the
        # current working directory instead of the checkpoint directory.
        shutil.copyfile(os.path.join(save_dir, ckpt_name),
                        os.path.join(save_dir, 'best_model.ckpt'))
def main(arg):
    """Train three models, validate them and save losses and weights.

    Training can be interrupted with Ctrl-C; the losses collected so far and
    the current weights are still written to ``./save``.

    Args:
        arg: options object providing ``model``, ``lr``, ``restore`` and
            ``epochs`` (and whatever ``init_datasets``, ``train`` and
            ``validate`` read from it).
    """
    # initialize datasets for training and v_datasets for validation
    resize = (200, 66)
    datasets, v_dataset = init_datasets(arg, resize, 3)

    # initialize models
    cuda = th.cuda.is_available()
    models = init_models(arg.model, 3, arg.lr, arg.restore, cuda)

    # Initiate Training
    t0 = datetime.datetime.now()
    t_losses = [[], [], []]
    v_losses = []
    try:
        for e in range(arg.epochs):
            for i, ((model, opt), dataset) in enumerate(zip(models, datasets)):
                print('training model %d' % i)
                losses = train(e, model, opt, dataset, arg, cuda=cuda)
                th.save(model.state_dict(), './save/%s%d_%s.p' % (arg.model, i, e))
                t_losses[i] += losses

                v = validate([[model, None]], v_dataset, arg, cuda=cuda)
                print('[!] model %d - validation loss: %s' % (i, v))

            v_loss = validate(models, v_dataset, arg, cuda=cuda)
            v_losses.append(v_loss)
            print('[valid] [e]:%s - [loss]:%s' % (e, v_loss))
    except KeyboardInterrupt:
        print('[!] KeyboardInterrupt: Stopped Training...')

    # save stuffs
    # Context managers make sure the pickle files are flushed and closed.
    with open('./save/%s_t_loss.p' % arg.model, 'wb') as t_loss_file:
        pkl.dump(t_losses, t_loss_file)
    with open('./save/%s_v_loss.p' % arg.model, 'wb') as v_loss_file:
        pkl.dump(v_losses, v_loss_file)
    for i, (m, _) in enumerate(models):
        th.save(m.state_dict(), './save/%s%d.p' % (arg.model, i))

    t1 = datetime.datetime.now()
    print('[!] Finished Training, Time Taken4 %s' % (t1-t0))
def save(self, file_name):
    """Write the state dict of ``self.net`` to ``file_name``."""
    weights = self.net.state_dict()
    torch.save(weights, file_name)
def val(self):
    """Run one validation loop, log the summed reward and save the agent.

    The agent is only saved when ``self.weight_dir`` is set; the file is
    named after the current step.
    """
    # validation
    loop_result = self._loop(is_train=False)
    val_reward = loop_result[1]
    self._writer.add_scalar("val_reward", sum(val_reward), self._step)

    if self.weight_dir is None:
        return
    self.agent.save(os.path.join(self.weight_dir, f"{self._step}.pkl"))
def train(epoch):
    """Train the colorization model for one epoch.

    Every batch appends its loss to ``./message.txt``; every 500 batches a
    progress line is written and the weights are saved. Any exception is
    written to ``log.txt`` and the weights are saved once more on exit.

    Args:
        epoch: epoch number, used in the progress message.
    """
    color_model.train()

    try:
        for batch_idx, (data, classes) in enumerate(train_loader):
            # The context manager closes the log even if this batch raises.
            with open('./message.txt', 'a') as messagefile:
                original_img = data[0].unsqueeze(1).float()
                img_ab = data[1].float()
                if have_cuda:
                    original_img = original_img.cuda()
                    img_ab = img_ab.cuda()
                    classes = classes.cuda()
                original_img = Variable(original_img)
                img_ab = Variable(img_ab)
                classes = Variable(classes)
                optimizer.zero_grad()
                class_output, output = color_model(original_img, original_img)
                ems_loss = torch.pow((img_ab - output), 2).sum() / torch.from_numpy(np.array(list(output.size()))).prod()
                # Float literal: ``1/300`` is integer zero under Python 2.
                cross_entropy_loss = 1.0 / 300 * F.cross_entropy(class_output, classes)
                loss = ems_loss + cross_entropy_loss
                lossmsg = 'loss: %.9f\n' % (loss.data[0])
                messagefile.write(lossmsg)
                ems_loss.backward(retain_variables=True)
                cross_entropy_loss.backward()
                optimizer.step()
                if batch_idx % 500 == 0:
                    message = 'Train Epoch:%d\tPercent:[%d/%d (%.0f%%)]\tLoss:%.9f\n' % (
                        epoch, batch_idx * len(data), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), loss.data[0])
                    messagefile.write(message)
                    torch.save(color_model.state_dict(), 'colornet_params.pkl')
                # print('Train Epoch: {}[{}/{}({:.0f}%)]\tLoss: {:.9f}\n'.format(
                #     epoch, batch_idx * len(data), len(train_loader.dataset),
                #     100. * batch_idx / len(train_loader), loss.data[0]))
    except Exception:
        # Best effort: record the traceback and fall through to the final save.
        with open('log.txt', 'w') as logfile:
            logfile.write(traceback.format_exc())
    finally:
        torch.save(color_model.state_dict(), 'colornet_params.pkl')
def train(model, db, args, bsz=32, eph=1, use_cuda=False):
    """Train ``model`` on ``db`` with SGD and save weights along the way.

    Args:
        model: network to train.
        db: dataset wrapped in a shuffling ``DataLoader``.
        args: not used in this function.
        bsz: batch size.
        eph: number of epochs.
        use_cuda: move batches to the GPU when CUDA is also available.
    """
    print("Training...")
    trainloader = data_utils.DataLoader(dataset=db, batch_size=bsz, shuffle=True)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
    best_loss = 100000

    for epoch in range(eph):
        running_loss = 0.0
        for i, batch in enumerate(trainloader, 1):
            inputs, targets = batch
            inputs = inputs.unsqueeze(1)
            targets = target_onehot_to_classnum_tensor(targets)
            if use_cuda and cuda_ava:
                inputs = Variable(inputs.float().cuda())
                targets = Variable(targets.cuda())
            else:
                inputs = Variable(inputs.float())
                targets = Variable(targets)

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0]
            last_loss = loss.data[0]
            if i % 100 != 0:
                continue

            # Periodic report, every 100 batches.
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i, running_loss / 100))
            running_loss = 0
            if last_loss < best_loss:
                # New best batch loss: evaluate and snapshot the weights.
                best_loss = last_loss
                acc = evaluate(model, trainloader, use_cuda)
                stamp = datetime.datetime.now().strftime("%b_%d_%H:%M:%S")
                snapshot = 'cnnT1_epoch_{}_iter_{}_loss_{}_acc_{}_{}.t7'.format(
                    epoch + 1, i, last_loss, acc, stamp)
                torch.save(model.state_dict(), os.path.join('saved_model', snapshot))

    acc = evaluate(model, trainloader, use_cuda)
    torch.save(model.state_dict(),
               os.path.join('saved_model', 'cnnT1_all_acc_{}.t7'.format(acc)))
    print("Finished Training!")
def load_word_vectors(path):
    """Load word vectors, building the cached files on first use.

    When both ``path + '.pth'`` and ``path + '.vocab'`` exist they are
    loaded directly. Otherwise ``path + '.txt'`` is parsed (one word
    followed by its space-separated vector components per line) and the two
    cache files are written.

    Args:
        path: common prefix of the ``.txt``, ``.pth`` and ``.vocab`` files.

    Returns:
        A ``(vocab, vectors)`` tuple.
    """
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    # First pass: count the lines (the file is closed again right away).
    with open(path + '.txt') as f:
        count = sum(1 for _ in f)
    # Second pass: the first line tells the vector dimension.
    with open(path + '.txt', 'r') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * (count)
    vectors = torch.zeros(count, dim)
    # Third pass: fill in the words and their vectors.
    with open(path + '.txt', 'r') as f:
        for idx, line in enumerate(f):
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
    with open(path + '.vocab', 'w') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors

# write unique words from a set of files to a new file
def save(self, label):
    """Do nothing; the body is empty and ``label`` is ignored."""
    return None

# helper saving function that can be used by subclasses
def save_network(self, network, network_label, epoch_label, gpu_ids):
    """Save the weights of ``network`` into ``self.save_dir``.

    The file is named ``<epoch_label>_net_<network_label>.pth``. The network
    is moved to the CPU for saving and, when GPU ids are given and CUDA is
    available, moved back to the first listed GPU.

    Args:
        network: module whose state dict is saved.
        network_label: label used in the file name.
        epoch_label: epoch label used in the file name.
        gpu_ids: sequence of GPU ids; may be empty.
    """
    target = os.path.join(self.save_dir,
                          '%s_net_%s.pth' % (epoch_label, network_label))
    cpu_weights = network.cpu().state_dict()
    torch.save(cpu_weights, target)
    if len(gpu_ids) and torch.cuda.is_available():
        network.cuda(device_id=gpu_ids[0])

# helper loading function that can be used by subclasses
def load(f, map_location=None, pickle_module=pickle):
    """Loads an object saved with torch.save from a disk file.

    torch.load can dynamically remap storages to be loaded on a different
    device using the map_location argument. If it's a callable, it will be
    called with two arguments: storage and location tag. It's expected to
    either return a storage that's been moved to a different location, or
    None (and the location will be resolved using the default method). If
    this argument is a dict it's expected to be a mapping from location tags
    used in a file, to location tags of the current system.

    By default the location tags are 'cpu' for host tensors and
    'cuda:device_id' (e.g. 'cuda:2') for cuda tensors. User extensions can
    register their own tagging and deserialization methods using
    register_package.

    Args:
        f: a file-like object (has to implement fileno that returns a file
            descriptor) or a string containing a file name
        map_location: a function or a dict specifying how to remap storage
            locations
        pickle_module: module used for unpickling metadata and objects (has
            to match the pickle_module used to serialize file)

    Returns:
        Whatever ``_load`` returns.
    """
    import sys

    # On Python 2 a file name may also arrive as ``unicode``; treat it as a
    # path as well instead of mistaking it for a file object.
    is_path = isinstance(f, str) or \
        (sys.version_info[0] == 2 and isinstance(f, unicode))  # noqa: F821
    if not is_path:
        return _load(f, map_location, pickle_module)

    # We opened the file ourselves, so we are responsible for closing it.
    with open(f, 'rb') as opened:
        return _load(opened, map_location, pickle_module)
def test_serialization(self):
    """A CUDA tensor keeps its value, type and device across save/load."""
    original = torch.randn(4, 4).cuda()
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(original, handle)
        handle.seek(0)
        restored = torch.load(handle)
    self.assertEqual(restored, original)
    self.assertIs(type(restored), type(original))
    self.assertEqual(restored.get_device(), original.get_device())
def test_serialization_empty(self):
    """A list holding an empty CUDA tensor survives save/load unchanged."""
    tensors = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(tensors, handle)
        handle.seek(0)
        restored = torch.load(handle)
    for original, copy in zip(tensors, restored):
        self.assertEqual(copy, original)
        self.assertIs(type(copy), type(original))
        self.assertEqual(copy.get_device(), original.get_device())
def test_multigpu_serialization_remap_dict(self):
    """Load tensors saved from two GPUs, remapping 'cuda:1' to 'cuda:0' via a dict."""
    tensors = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(tensors, handle)
        handle.seek(0)
        restored = torch.load(handle, map_location={'cuda:1': 'cuda:0'})
    for original, copy in zip(tensors, restored):
        self.assertEqual(copy, original)
        self.assertIs(type(copy), type(original))
        self.assertEqual(copy.get_device(), 0)
def test_serialization_container(self):
    """Loading a saved module warns once its defining source has changed.

    A ``Net`` from ``data/network1.py`` is saved; loading it is expected to
    be silent while the source is unchanged and to emit exactly one
    ``SourceChangeWarning`` after the module is replaced by
    ``data/network2.py``.
    """
    def import_module(name, filename):
        # Import ``filename`` as module ``name`` and register it in sys.modules.
        if sys.version_info >= (3, 5):
            import importlib.util
            spec = importlib.util.spec_from_file_location(name, filename)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        else:
            import imp
            module = imp.load_source(name, filename)
        sys.modules[module.__name__] = module
        return module

    with tempfile.NamedTemporaryFile() as checkpoint:
        module = import_module('tmpmodule', 'data/network1.py')
        torch.save(module.Net(), checkpoint)

        # First check that the checkpoint can be loaded without warnings
        checkpoint.seek(0)
        with warnings.catch_warnings(record=True) as w:
            loaded = torch.load(checkpoint)
            self.assertTrue(isinstance(loaded, module.Net))
            self.assertEqual(len(w), 0)

        # Replace the module with different source
        module = import_module('tmpmodule', 'data/network2.py')
        checkpoint.seek(0)
        with warnings.catch_warnings(record=True) as w:
            loaded = torch.load(checkpoint)
            self.assertTrue(isinstance(loaded, module.Net))
            self.assertEqual(len(w), 1)
            # Compare the category name: assertTrue(x, 'name') would only
            # check truthiness and use the string as the failure message.
            self.assertEqual(w[0].category.__name__, 'SourceChangeWarning')