def test_regression():
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    features, targets = boston.data[:50], boston.target[:50]
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, random_state=rng)

    # Every combination of the sample/feature sub-sampling options is tried.
    settings = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [0.5, 1.0],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
    })
    candidates = [
        None,
        DummyRegressor(),
        DecisionTreeRegressor(),
        KNeighborsRegressor(),
        SVR(),
    ]

    # Smoke test only: fit/predict must not raise; predictions are discarded.
    for base_estimator in candidates:
        for params in settings:
            ensemble = BaggingRegressor(base_estimator=base_estimator,
                                        random_state=rng,
                                        **params)
            ensemble.fit(X_train, y_train).predict(X_test)
def run(self, grid_config):
    """Check that every classifier in ``grid_config`` can be imported and built.

    Parameters
    ----------
    grid_config : dict
        Maps a dotted class path (``"package.module.ClassName"``) to a
        parameter-grid configuration accepted by ``ParameterGrid``.

    Raises
    ------
    ValueError
        If a class path cannot be resolved or its grid cannot be expanded
        ("Unable to import classifier ..."), or if the class cannot be
        instantiated with one of the parameter combinations
        ("Unable to instantiate classifier ...").
    """
    for classpath, parameter_config in grid_config.items():
        # Resolution of the class and expansion of the grid are guarded on
        # their own, so that an instantiation failure further down is not
        # caught here and mislabelled as an import failure.
        try:
            module_name, class_name = classpath.rsplit(".", 1)
            module = importlib.import_module(module_name)
            cls = getattr(module, class_name)
            parameter_sets = list(ParameterGrid(parameter_config))
        except Exception as e:
            raise ValueError(dedent('''Section: grid_config - Unable to import classifier {}, error thrown: {} '''.format(classpath, e)))

        for parameters in parameter_sets:
            try:
                cls(**parameters)
            except Exception as e:
                raise ValueError(dedent('''Section: grid_config - Unable to instantiate classifier {} with parameters {}, error thrown: {} '''.format(classpath, parameters, e)))
def val_tune_rf(estimator, x_train, y_train, x_val, y_val, params):
    """Score every parameter combination on a held-out validation set.

    For each point of ``ParameterGrid(params)`` the estimator is
    reconfigured with ``set_params``, refitted on the training data and
    scored with ``roc_auc_score`` on column 1 of ``predict_proba(x_val)``.
    Progress and the final ranking are printed.

    Parameters
    ----------
    estimator : object
        Must provide ``set_params``, ``fit`` and ``predict_proba``. It is
        mutated in place and is left configured with the *last* grid point,
        not the best one.
    x_train, y_train : training data passed to ``estimator.fit``.
    x_val, y_val : validation data used for scoring.
    params : dict or list of dict
        Grid specification accepted by ``ParameterGrid``.

    Returns
    -------
    list of (dict, float)
        ``(parameters, roc_auc)`` pairs sorted by ascending score, so the
        best combination is the last element.
    """
    params_list = list(ParameterGrid(params))
    # Single-argument print(...) calls produce identical output under both
    # the Python 2 print statement and the Python 3 print function.
    print(params_list)
    print(y_val)
    results = []
    for param in params_list:
        print('=========  %s' % (param,))
        estimator.set_params(**param)
        estimator.fit(x_train, y_train)
        preds_prob = estimator.predict_proba(x_val)
        result = roc_auc_score(y_val, preds_prob[:, 1])
        print('roc_auc_score : %f' % result)
        results.append((param, result))
    results.sort(key=lambda k: k[1])
    print(results)
    if results:
        # Best-scoring combination.
        print(results[-1])
    return results
def grid_search():
    """Train once per parameter combination and append each outcome to a CSV.

    The grid covers embeddings, optimiser and re-weighting. One timestamped
    results file is chosen up front; for every combination the parameters
    are written, ``train.main`` is run, and the best dev epoch/accuracy it
    returns are appended on the same line.
    """
    embedding_choices = [
        ("data/pol/orth", "w2v_allwiki_nkjp300_300"),
        ("data/pol/lemma", "w2v_allwiki_nkjp300_300"),
        ("resources/pol/fasttext", "wiki.pl"),
    ]
    search_space = {
        "embeddings": embedding_choices,
        "optim": ["adam", "adagrad"],
        'reweight': [True, False],
    }
    grid = ParameterGrid(search_space)

    started_at = datetime.now()
    filename = "results/{date:%Y%m%d_%H%M}_results.csv".format(date=started_at)

    banner = 'Starting a grid search through {n} parameter combinations'
    print(banner.format(n=len(grid)))

    for params in grid:
        print(params)
        # The file is reopened in append mode for every run, so completed
        # results are on disk before the next training run starts.
        with open(filename, "a") as results_file:
            results_file.write(str(params) + ", ")
            best_epoch, best_accuracy, _ = train.main(params)
            summary = 'Epoch {epoch}, accuracy {acc:.4f}\n'.format(
                epoch=best_epoch, acc=best_accuracy)
            results_file.write(summary)
def test_classification():
    # Check classification for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    # Every combination of the sample/feature sub-sampling options is tried.
    settings = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
    })
    candidates = [
        None,
        DummyClassifier(),
        Perceptron(),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(),
    ]

    # Smoke test only: fit/predict must not raise; predictions are discarded.
    for base_estimator in candidates:
        for params in settings:
            ensemble = BaggingClassifier(base_estimator=base_estimator,
                                         random_state=rng,
                                         **params)
            ensemble.fit(X_train, y_train).predict(X_test)
def _create_batches(self):
    """Distribute every (fold, parameters) task round-robin over batches.

    Returns a list of ``comm_size`` lists. Each task is a tuple
    ``(fold_number, train_index, test_index, parameters)`` where
    ``fold_number`` is 1-based; task ``k`` (in fold-major order) goes to
    batch ``k % comm_size``.
    """
    grid = ParameterGrid(self.param_grid)
    # one (initially empty) batch per slot of the communicator
    batches = [[] for _ in range(comm_size)]
    tasks = (
        (fold_number, train_index, test_index, parameters)
        for fold_number, (train_index, test_index) in enumerate(self.cv_iter, 1)
        for parameters in grid
    )
    for position, task in enumerate(tasks):
        batches[position % comm_size].append(task)
    return batches
def _generate_model_configs(self, grid_config): """Flattens a model/parameter grid configuration into individually trainable model/parameter pairs Yields: (tuple) classpath and parameters """ for class_path, parameter_config in grid_config.items(): for parameters in ParameterGrid(parameter_config): yield class_path, parameters
def fit(self, frame):
    """Fit the grid search.

    Expands ``self.param_grid`` into a ``ParameterGrid`` and hands it,
    together with the training frame, to ``self._fit``.

    Parameters
    ----------
    frame : H2OFrame, shape=(n_samples, n_features)
        The training frame on which to fit.

    Returns
    -------
    Whatever ``self._fit`` returns.
    """
    candidates = ParameterGrid(self.param_grid)
    return self._fit(frame, candidates)
def fit(self, X, y=None):
    """Run fit with all sets of parameters.

    Expands ``self.param_grid`` into a ``ParameterGrid`` and delegates the
    actual search to ``self._fit``.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    Returns
    -------
    Whatever ``self._fit`` returns.
    """
    candidates = ParameterGrid(self.param_grid)
    return self._fit(X, y, candidates)
def __init__(self, experiment, args, job_module_config):
    """Initialise the base class and build the exhaustive parameter grid.

    ``experiment['params']`` is normalised in place: every value that is
    not exactly a ``list`` is wrapped in a one-element list, because
    ``ParameterGrid`` expects a list of candidate values per parameter.
    The resulting grid is stored on ``self.searcher``.
    """
    # NOTE(review): super(self.__class__, self) recurses endlessly if this
    # class is ever subclassed; the class name is not visible here, so the
    # call is left exactly as written.
    super(self.__class__, self).__init__(experiment, args, job_module_config)

    grid_spec = experiment['params']
    for name in grid_spec:
        value = grid_spec[name]
        # Exact type check on purpose: tuples and list subclasses are
        # treated as single values and get wrapped too.
        if type(value) is not list:
            grid_spec[name] = [value]

    self.searcher = ParameterGrid(grid_spec)
def create_parameter_grid(param_dict):
    """Return a scikit-learn ``ParameterGrid`` built from ``param_dict``.

    scikit-learn is imported inside the function, so the dependency is only
    loaded when a grid is actually requested.
    """
    from sklearn.model_selection import ParameterGrid

    grid = ParameterGrid(param_dict)
    return grid
def _get_param_iterator(self):
    """Build the exhaustive ParameterGrid over ``self.param_grid``."""
    grid = model_selection.ParameterGrid(self.param_grid)
    return grid


# ------------------ #
# RandomizedSearchCV #
# ------------------ #
def fit_binarized(self, X_featurized, Y_binarized, validation_data=None, **kwargs):
    """Grid-search the classifier's parameters, then refit on all data.

    Every point of ``ParameterGrid(self.param_grid)`` is scored by
    ``_fit_classifier`` on a validation set; the best-scoring parameters
    are merged over ``self.classifier_args`` and a final classifier is
    fitted on the complete ``X_featurized`` / ``Y_binarized``.

    Parameters
    ----------
    X_featurized, Y_binarized
        Training features and binarized targets.
    validation_data : tuple or None
        ``(X_validation, Y_validation)``. When ``None``, a fraction
        ``self.validation_size`` of the training data is held out instead.
    **kwargs
        Accepted for interface compatibility; not used here.

    Returns
    -------
    self
        With the fitted model stored on ``self.classifier_``.

    Raises
    ------
    ValueError
        If the parameter grid yields no candidates.
    """
    klass = get_class_from_module_path(self.classifier)

    if validation_data is None:
        # No explicit validation set: hold out self.validation_size of the
        # training data.
        X_train, X_validation, Y_train, Y_validation = train_test_split(X_featurized, Y_binarized, test_size=self.validation_size)
        logger.info('Using {} of training data ({} instances) for validation.'.format(self.validation_size, Y_validation.shape[0]))
    else:
        X_train, X_validation, Y_train, Y_validation = X_featurized, validation_data[0], Y_binarized, validation_data[1]
    #end if

    # joblib treats negative n_jobs (e.g. -1 = all cores) as parallel and
    # None/1 as sequential, so only the latter take the quiet debug path.
    runs_in_parallel = self.n_jobs is not None and self.n_jobs != 1
    if runs_in_parallel:
        logger.info('Performing hyperparameter gridsearch in parallel using {} jobs.'.format(self.n_jobs))
    else:
        logger.debug('Performing hyperparameter gridsearch sequentially (n_jobs={}).'.format(self.n_jobs))

    param_scores = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_classifier)(klass, self.classifier_args, param, self.metric, X_train, Y_train, X_validation, Y_validation)
        for param in ParameterGrid(self.param_grid)
    )
    if not param_scores:
        # max() would fail with an opaque "empty sequence" message.
        raise ValueError('Hyperparameter grid is empty; no candidates to evaluate.')

    # Each entry is a (param, score) pair; keep the highest-scoring one.
    best_param, best_score = max(param_scores, key=lambda x: x[1])
    logger.info('Best scoring param is {} with score {}.'.format(best_param, best_score))

    # Searched parameters override the fixed classifier arguments.
    classifier_args = dict(self.classifier_args)
    classifier_args.update(best_param)

    self.classifier_ = klass(**classifier_args)
    logger.info('Fitting final model <{}> on full data with param {}.'.format(self.classifier_, best_param))
    self.classifier_.fit(X_featurized, Y_binarized)

    return self
#end def
#end class
def test_parameter_grid():
    # Test basic properties of ParameterGrid.
    single_axis = {"foo": [1, 2, 3]}
    single_grid = ParameterGrid(single_axis)
    for abstract_type in (Iterable, Sized):
        assert_true(isinstance(single_grid, abstract_type))
    assert_equal(len(single_grid), 3)
    assert_grid_iter_equals_getitem(single_grid)

    two_axes = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]}
    double_grid = ParameterGrid(two_axes)
    assert_equal(len(double_grid), 6)

    # Two passes, to assert we can iterate over the grid multiple times.
    for _ in range(2):
        seen_points = set()
        for point in double_grid:
            # sorted + chain flattens {"a": 1, "b": 2} into ("a", 1, "b", 2)
            seen_points.add(tuple(chain(*sorted(point.items()))))
        expected_points = set(
            ("bar", bar, "foo", foo)
            for bar, foo in product(two_axes["bar"], two_axes["foo"]))
        assert_equal(seen_points, expected_points)
    assert_grid_iter_equals_getitem(double_grid)

    # Special case: empty grid (useful to get default estimator settings)
    empty = ParameterGrid({})
    assert_equal(len(empty), 1)
    assert_equal(list(empty), [{}])
    assert_grid_iter_equals_getitem(empty)
    assert_raises(IndexError, lambda: empty[1])

    # A list of grids may contain an empty grid between non-empty ones.
    has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
    expected_settings = [{'C': 1}, {'C': 10}, {}, {'C': .5}]
    assert_equal(len(has_empty), 4)
    assert_equal(list(has_empty), expected_settings)
    assert_grid_iter_equals_getitem(has_empty)
def test_parameters_sampler_replacement():
    # raise error if n_iter too large
    small_grid = {'first': [0, 1], 'second': ['a', 'b', 'c']}
    oversized = ParameterSampler(small_grid, n_iter=7)
    assert_raises(ValueError, list, oversized)

    # degenerates to GridSearchCV if n_iter the same as grid_size
    exhaustive = list(ParameterSampler(small_grid, n_iter=6))
    assert_equal(len(exhaustive), 6)
    for grid_point in ParameterGrid(small_grid):
        assert_true(grid_point in exhaustive)

    # test sampling without replacement in a large grid
    large_grid = {'a': range(10), 'b': range(10), 'c': range(10)}
    drawn = list(ParameterSampler(large_grid, n_iter=99, random_state=42))
    assert_equal(len(drawn), 99)
    # dicts are unhashable, so each sample is encoded as a string key first
    unique_keys = set()
    for sample in drawn:
        unique_keys.add("a%db%dc%d" % (sample['a'], sample['b'], sample['c']))
    assert_equal(len(unique_keys), 99)

    # doesn't go into infinite loops
    mixed_space = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
    from_distribution = list(ParameterSampler(mixed_space, n_iter=7))
    assert_equal(len(from_distribution), 7)
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    random_state = 0
    param_grid = {
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    }
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    # needs to be nonnegative before making it sparse
    S -= S.min()
    # threshold some values
    S = np.where(S < 1, 0, S)

    # The same checks run on the dense array and on its CSR counterpart.
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            # every row and every column belongs to exactly one bicluster
            row_membership = model.rows_.sum(axis=0)
            column_membership = model.columns_.sum(axis=0)
            assert_array_equal(row_membership, np.ones(30))
            assert_array_equal(column_membership, np.ones(30))
            # the planted biclusters are recovered exactly
            score = consensus_score(model.biclusters_, (rows, cols))
            assert_equal(score, 1)

            _test_shape_indices(model)
def _to_param_meta(param_grid, control):
    '''Acquire parameter metadata such as bounds that are useful for sampling

    Entries of ``param_grid`` whose value has an ``rvs`` attribute are
    treated as distributions; all other entries are treated as explicit
    choices and expanded with ``ParameterGrid``.

    Returns a dict with the keys ``control``, ``high``, ``low``,
    ``choices``, ``is_int`` and ``param_order``.  The five list-valued
    entries are parallel: position ``i`` describes ``param_order[i]``.
    Choice parameters come first (in the order they are first seen while
    walking the grid), followed by the distribution parameters.

    NOTE(review): the original formatting of this block was lost; the
    nesting below is reconstructed from the statements - confirm.
    '''
    choice_params = {k: v for k, v in param_grid.items()
                     if not hasattr(v, 'rvs')}
    distributions = {k: v for k, v in param_grid.items()
                     if k not in choice_params}
    pg_list = list(ParameterGrid(choice_params))
    choices, low, high, param_order, is_int = [], [], [], [], []
    is_continuous = lambda v: isinstance(v, numbers.Real)
    # Walk every grid point and accumulate, per parameter, the distinct
    # values seen plus running low/high bounds.
    while len(pg_list):
        pg2 = pg_list.pop(0)
        for k, v in pg2.items():
            if k in param_order:
                idx = param_order.index(k)
            else:
                # First sighting of this parameter: open its slot in every
                # parallel list, seeded with the current value, and move on.
                idx = len(param_order)
                param_order.append(k)
                low.append(v)
                high.append(v)
                choices.append([v])
                # NOTE(review): despite the name, is_int is True here when
                # the value is NOT a numbers.Real instance - confirm intent.
                is_int.append(not is_continuous(v))
                continue
            if v not in choices[idx]:
                choices[idx].append(v)
            if is_continuous(v):
                # Real-valued: clear the flag and widen the bounds.
                is_int[idx] = False
                if v < low[idx]:
                    low[idx] = v
                if v > high[idx]:
                    high[idx] = v
            else:
                # Non-real value: set the flag and overwrite both bounds
                # with the current value.
                is_int[idx] = True
                low[idx] = high[idx] = v
    # Distribution parameters carry no bounds; the distribution object
    # itself is stored in the ``choices`` slot.
    for k, v in distributions.items():
        choices.append(v)
        low.append(None)
        high.append(None)
        is_int.append(False)
        param_order.append(k)
    param_meta = dict(control=control, high=high, low=low,
                      choices=choices, is_int=is_int,
                      param_order=param_order)
    return param_meta
def clf_loop(self, X_train, X_test, y_train, y_test, individuals, setting):
    '''
    Runs through each model specified by models_to_run once with each
    possible setting in params.

    For every model at most ``self.iterations_max`` parameter settings are
    run.  When the grid is larger than that, settings are drawn at random
    (with replacement); otherwise the grid is walked in order.  Each run
    builds a ``Model``, executes it and writes its performance report
    (plus a ROC image when ``self.roc`` is set).

    An IndexError, RuntimeError or AttributeError raised by a run is
    printed with its traceback and that run is skipped.  A failed run
    consumes its iteration, so a parameter setting that always fails
    cannot stall the loop.
    '''
    N = 0
    self.prepare_report()
    for index, clf in enumerate([self.clfs[x] for x in self.models_to_run]):
        model_name = self.models_to_run[index]
        print('Running {}.'.format(model_name))

        # Materialise the grid once instead of on every iteration.
        candidates = list(ParameterGrid(self.params[model_name]))
        sample_randomly = len(candidates) > self.iterations_max

        iteration = 0
        while iteration < self.iterations_max and iteration < len(candidates):
            print(' Running Iteration {} of {}...'.format(iteration + 1, self.iterations_max))
            if sample_randomly:
                p = random.choice(candidates)
            else:
                p = candidates[iteration]
            try:
                m = Model(clf, X_train, y_train, X_test, y_test, p, N,
                          model_name, iteration, self.output_dir,
                          thresholds = self.thresholds, ks = self.ks,
                          report = self.report, label='label',
                          individuals=individuals, setting=setting)
                m.run()
                print(' Printing to file...')
                if not self.roc:
                    m.performance_to_file()
                else:
                    m.performance_to_file(roc='{}ROC_{}_{}-{}.png'.format(
                        self.output_dir, model_name, N, iteration))
            except (IndexError, RuntimeError, AttributeError) as e:
                # Report and skip this setting; the counter below still
                # advances so the same failing setting is not retried.
                print(p)
                print(N)
                print('{}: {}'.format(type(e).__name__, e))
                print(traceback.format_exc())
            iteration += 1
        # NOTE(review): N is advanced once per model; the original
        # formatting was lost, so confirm it is not meant to be per run.
        N += 1