Python scipy.stats module: entropy() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use scipy.stats.entropy().
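Before the project examples, a minimal usage sketch (not taken from any of the projects below): entropy(pk) returns the Shannon entropy in nats and normalizes pk automatically, passing qk computes the Kullback-Leibler divergence KL(pk||qk), and base=2 switches to bits.

from scipy.stats import entropy
import numpy as np

p = np.array([0.5, 0.25, 0.25])
q = np.array([0.4, 0.4, 0.2])

print(entropy(p))             # Shannon entropy in nats, ~1.04
print(entropy(p, base=2))     # same entropy in bits, 1.5
print(entropy(p, q))          # KL divergence KL(p || q) in nats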

Project: treecat    Author: posterior    | Project source | File source
def multinomial_entropy(probs, count):
    """Compute entropy of multinomial distribution with given probs and count.

    Args:
        probs: A 1-dimensional array of normalized probabilities.
        count: The number of draws in a multinomial distribution.

    Returns:
        A number in [0, count * len(probs)] representing entropy.
    """
    assert count > 0
    multi_probs = probs
    for _ in range(count - 1):
        if len(probs) > 2:
            raise NotImplementedError(
                'Only categorical and binomial are supported')
        multi_probs = np.convolve(multi_probs, probs)
    return entropy(multi_probs)
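A hedged sketch of how this helper behaves: for the binomial case with a fair coin and count=2, the repeated convolution builds the distribution over the number of successes, whose entropy is then returned in nats. The numbers below are for illustration only.

import numpy as np
from scipy.stats import entropy

probs = np.array([0.5, 0.5])      # binomial case, p = 0.5
count = 2
multi_probs = probs
for _ in range(count - 1):
    multi_probs = np.convolve(multi_probs, probs)   # -> [0.25, 0.5, 0.25]
print(entropy(multi_probs))        # ~1.04 nats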
Project: treecat    Author: posterior    | Project source | File source
def observed_perplexity(self, counts):
        """Compute perplexity = exp(entropy) of observed variables.

        Perplexity is an information theoretic measure of the number of
        clusters or latent classes. Perplexity is a real number in the range
        [1, M], where M is model_num_clusters.

        Args:
            counts: A [V]-shaped array of multinomial counts.

        Returns:
            A [V]-shaped numpy array of perplexity.
        """
        V, E, M, R = self._VEMR
        if counts is None:
            counts = np.ones(V, dtype=np.int8)
        assert counts.shape == (V, )
        assert counts.dtype == np.int8
        assert np.all(counts > 0)
        observed_entropy = np.empty(V, dtype=np.float32)
        for v in range(V):
            beg, end = self._ragged_index[v:v + 2]
            probs = np.dot(self._feat_cond[beg:end, :], self._vert_probs[v, :])
            observed_entropy[v] = multinomial_entropy(probs, counts[v])
        return np.exp(observed_entropy)
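The perplexity = exp(entropy) relation used throughout these treecat snippets can be checked in isolation: a uniform distribution over M outcomes has perplexity exactly M, so perplexity reads as an effective number of clusters or categories. A minimal sketch:

import numpy as np
from scipy.stats import entropy

uniform = np.ones(8) / 8.0
peaked = np.array([0.97, 0.01, 0.01, 0.01])

print(np.exp(entropy(uniform)))   # 8.0 -> 8 effective clusters
print(np.exp(entropy(peaked)))    # ~1.18, close to 1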
Project: treecat    Author: posterior    | Project source | File source
def observed_perplexity(self, counts):
        """Compute perplexity = exp(entropy) of observed variables.

        Perplexity is an information theoretic measure of the number of
        clusters or observed classes. Perplexity is a real number in the range
        [1, dim[v]], where dim[v] is the number of categories in an observed
        categorical variable or 2 for an ordinal variable.

        Args:
            counts: A [V]-shaped array of multinomial counts.

        Returns:
            A [V]-shaped numpy array of perplexity.
        """
        result = self._ensemble[0].observed_perplexity(counts)
        for server in self._ensemble[1:]:
            result += server.observed_perplexity(counts)
        result /= len(self._ensemble)
        return result
Project: geomdn    Author: afshinrahimi    | Project source | File source
def get_local_words(preds, vocab, NEs=[], k=50):
    """
    given the word probabilities over many coordinates,
    first normalize the probability of each word in different
    locations to get a probability distribution, then compute
    the entropy of the word's distribution over all coordinates
    and return the words that are low entropy and are not
    named entities.
    """
    #normalize the probabilities of each vocab entry over locations (l1), then rank words by entropy
    normalized_preds = normalize(preds, norm='l1', axis=0)
    entropies = stats.entropy(normalized_preds)
    sorted_indices = np.argsort(entropies)
    sorted_local_words = np.array(vocab)[sorted_indices].tolist()


    filtered_local_words = []
    NEset = set(NEs)
    for word in sorted_local_words:
        if word in NEset: continue
        filtered_local_words.append(word)
    return filtered_local_words[0:k]
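A toy sketch of the column-wise behaviour relied on here (shapes are assumptions; normalize comes from sklearn.preprocessing): after l1-normalizing each column, stats.entropy applied to a 2-D array returns one entropy per column, so words concentrated in a few locations get low entropy and sort first.

import numpy as np
from scipy import stats
from sklearn.preprocessing import normalize

# rows = locations, columns = words (hypothetical toy matrix)
preds = np.array([[0.9, 0.3],
                  [0.05, 0.35],
                  [0.05, 0.35]])
vocab = ['tram', 'the']

normalized = normalize(preds, norm='l1', axis=0)   # each column sums to 1
entropies = stats.entropy(normalized)              # one entropy per word (column)
print(dict(zip(vocab, entropies)))                 # 'tram' (localized) has the lower entropy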
Project: alp    Author: davefernig    | Project source | File source
def __query_by_committee(self, clf, X_unlabeled):
        num_classes = len(clf[0].classes_)
        C = len(clf)
        preds = []

        if self.strategy == 'vote_entropy':
            for model in clf:
                y_out = map(int, model.predict(X_unlabeled))
                preds.append(np.eye(num_classes)[y_out])

            votes = np.apply_along_axis(np.sum, 0, np.stack(preds)) / C
            return np.apply_along_axis(entropy, 1, votes)

        elif self.strategy == 'average_kl_divergence':
            for model in clf:
                preds.append(model.predict_proba(X_unlabeled))

            consensus = np.mean(np.stack(preds), axis=0)
            divergence = []
            for y_out in preds:
                divergence.append(entropy(consensus.T, y_out.T))

            return np.apply_along_axis(np.mean, 0, np.stack(divergence))
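A hedged worked example of the vote-entropy branch with made-up committee votes: each model's hard predictions are one-hot encoded, the vote fractions per sample are computed, and the samples on which the committee disagrees most get the highest entropy.

import numpy as np
from scipy.stats import entropy

num_classes = 3
# hypothetical hard predictions of a 3-model committee on 2 unlabeled samples
committee_votes = [[0, 1],    # model 1 predicts class 0 and class 1
                   [0, 2],    # model 2
                   [0, 1]]    # model 3

preds = [np.eye(num_classes)[v] for v in committee_votes]
votes = np.apply_along_axis(np.sum, 0, np.stack(preds)) / len(preds)
print(np.apply_along_axis(entropy, 1, votes))
# sample 0: all models agree -> entropy 0; sample 1: split votes -> higher entropy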
Project: StageDP    Author: EastonWang    | Project source | File source
def select(self, features, freq_table):
        """ Select features via some criteria

        :type features: dict
        :param features: features vocab

        :type freq_table: 2-D numpy.array
        :param freq_table: frequency table with rows as features,
                          columns as frequency values
        """
        if self.method == 'frequency':
            feat_vals = self.frequency(features, freq_table)
        elif self.method == 'entropy':
            feat_vals = self.entropy(features, freq_table)
        elif self.method == 'freq-entropy':
            feat_vals = self.freq_entropy(features, freq_table)
        else:
            raise KeyError("Unrecognized method")
        new_features = self.rank(feat_vals)
        return new_features
Project: catchy    Author: jvbalen    | Project source | File source
def parse_feature(feature):
    """ Parse feature string into
            (feature name, [1st order aggregates], [2nd order aggregates]).

        'Grammar':
        - feature name and aggregates are separated by dots, e.g. 'mfcc.entropy'
        - feature name is first and contains no dots
        - first order and second order aggregates are separated by one of 2 keywords:
            'corpus' or 'song'

        Ex.:
        >>> parse_feature('loudness.mean.song.pdf.log')
        ('loudness', ['mean'], ['song', 'pdf', 'log'])
    """
    s = np.array(feature.split('.'))
    split_points = (s == 'corpus') | (s == 'song')
    split_points = np.nonzero(split_points)[0] if any(split_points) else [len(s)]
    return s[0], s[1:split_points[0]].tolist(), s[split_points[-1]:].tolist()
Project: cptm    Author: NLeSC    | Project source | File source
def jsd_opinions(co):
    """Calculate Jensen-Shannon divergence between (contrastive) opinions.

    Implements Jensen-Shannon divergence between (contrastive) opinions as
    described in [Fang et al., 2012] section 3.2.

    Parameter:
        co : numpy ndarray
        A numpy ndarray containing (contrastive) opinions (see
        contrastive_opinions(query, topics, opinions, nks))

    Returns:
        float
        The Jensen-Shannon divergence between the contrastive opinions.
    """
    logger.debug('calculate Jensen-Shannon divergence between (contrastive) '
                 'opinions')

    nPerspectives = co.shape[1]

    result = np.zeros(nPerspectives, dtype=np.float)
    p_avg = np.mean(co, axis=1)
    for persp in range(nPerspectives):
        result[persp] = entropy(co[:, persp], qk=p_avg, base=2)
    return np.mean(result)
Project: scanpy    Author: theislab    | Project source | File source
def aga_expression_entropies(adata):
    """Compute the median expression entropy for each node-group.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.

    Returns
    -------
    entropies : list
        Entropies of median expressions for each node.
    """
    from scipy.stats import entropy
    groups_order, groups_masks = utils.select_groups(adata,
                                                     key=adata.uns['aga_groups_key'])
    entropies = []
    for mask in groups_masks:
        X_mask = adata.X[mask]
        x_median = np.median(X_mask, axis=0)
        x_probs = (x_median - np.min(x_median)) / (np.max(x_median) - np.min(x_median))
        entropies.append(entropy(x_probs))
    return entropies
Project: DebateAnalysis    Author: Lingistic    | Project source | File source
def build_interaction_graph(mallet_model, threshold):
    g = networkx.Graph()
    topic_matrix = mallet_model.theta
    for i in xrange(topic_matrix.shape[1]):
        print i
        for j in xrange(i+1, topic_matrix.shape[1]):
            divergence_ij = stats.entropy(topic_matrix[:,i], topic_matrix[:,j])
            divergence_ji = stats.entropy(topic_matrix[:,j], topic_matrix[:,i])
            # quick and dirty "symmetrization" plus inversion
            inverse_divergence_sym = float(1/(divergence_ij + divergence_ji))
            if inverse_divergence_sym >= threshold:
                g.add_node(j, label=', '.join(mallet_model.list_topic(j, 3)))
                g.add_edge(i, j, weight=inverse_divergence_sym)
            else:
                g.add_node(i)

    for i in xrange(topic_matrix.shape[1]):
        if len(g.edge[i]) == 0:
            g.remove_node(i)
    for i in xrange(topic_matrix.shape[1]):
        if i in g.node and len(g.node[i]) == 0 and len(g.edge[i]) != 0:
            print i
            g.add_node(i, label=', '.join(mallet_model.list_topic(i, 3)))
    return g
Project: treecat    Author: posterior    | Project source | File source
def correlation(probs):
    """Compute correlation rho(X,Y) = sqrt(1 - exp(-2 I(X;Y))).

    Args:
        probs: An [M, M]-shaped numpy array representing a joint distribution.

    Returns:
        A number in [0,1) representing the information-theoretic correlation.
    """
    assert len(probs.shape) == 2
    assert probs.shape[0] == probs.shape[1]
    mutual_information = (entropy(probs.sum(0)) + entropy(probs.sum(1)) -
                          entropy(probs.flatten()))
    return np.sqrt(1.0 - np.exp(-2.0 * mutual_information))
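A hedged check of the formula rho = sqrt(1 - exp(-2 I(X;Y))): an independent joint distribution has mutual information 0 and correlation 0, while a 2x2 diagonal joint (X equal to Y, fair coin) gives I = ln 2 and rho = sqrt(1 - 1/4), about 0.866.

import numpy as np
from scipy.stats import entropy

identical = np.array([[0.5, 0.0],
                      [0.0, 0.5]])           # joint of X == Y, fair coin
mi = (entropy(identical.sum(0)) + entropy(identical.sum(1)) -
      entropy(identical.flatten()))          # I(X;Y) = ln 2
print(np.sqrt(1.0 - np.exp(-2.0 * mi)))      # sqrt(0.75) ~ 0.866

independent = np.full((2, 2), 0.25)          # independent marginals -> I = 0
mi = (entropy(independent.sum(0)) + entropy(independent.sum(1)) -
      entropy(independent.flatten()))
print(np.sqrt(1.0 - np.exp(-2.0 * mi)))      # 0.0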
Project: treecat    Author: posterior    | Project source | File source
def observed_perplexity(self, counts):
        """Compute perplexity = exp(entropy) of observed variables."""
Project: treecat    Author: posterior    | Project source | File source
def latent_perplexity(self):
        """Compute perplexity = exp(entropy) of latent variables."""
Project: treecat    Author: posterior    | Project source | File source
def latent_perplexity(self):
        """Compute perplexity = exp(entropy) of latent variables.

        Perplexity is an information theoretic measure of the number of
        clusters or latent classes. Perplexity is a real number in the range
        [1, M], where M is model_num_clusters.

        Returns:
            A [V]-shaped numpy array of perplexity.
        """
        result = self._ensemble[0].latent_perplexity()
        for server in self._ensemble[1:]:
            result += server.latent_perplexity()
        result /= len(self._ensemble)
        return result
Project: treecat    Author: posterior    | Project source | File source
def observed_perplexity(self):
        """Compute perplexity = exp(entropy) of observed variables.

        Perplexity is an information theoretic measure of the number of
        clusters or observed classes. Perplexity is a real number in the range
        [1, dim[v]], where dim[v] is the number of categories in an observed
        categorical variable or 2 for an ordinal variable.

        Returns:
            A [V]-shaped numpy array of perplexity.
        """
        return self._server.observed_perplexity(self._counts)
Project: treecat    Author: posterior    | Project source | File source
def latent_perplexity(self):
        """Compute perplexity = exp(entropy) of latent variables.

        Perplexity is an information theoretic measure of the number of
        clusters or latent classes. Perplexity is a real number in the range
        [1, M], where M is model_num_clusters.

        Returns:
            A [V]-shaped numpy array of perplexity.
        """
        return self._server.latent_perplexity()
Project: bnn-analysis    Author: myshkov    | Project source | File source
def kl_divergence(p_samples, q_samples):
    # estimate densities
    # p_samples = np.nan_to_num(p_samples)
    # q_samples = np.nan_to_num(q_samples)

    if isinstance(p_samples, tuple):
        idx, p_samples = p_samples

        if idx not in _cached_p_pdf:
            _cached_p_pdf[idx] = sc.gaussian_kde(p_samples)

        p_pdf = _cached_p_pdf[idx]
    else:
        p_pdf = sc.gaussian_kde(p_samples)

    q_pdf = sc.gaussian_kde(q_samples)

    # joint support
    left = min(min(p_samples), min(q_samples))
    right = max(max(p_samples), max(q_samples))

    p_samples_num = p_samples.shape[0]
    q_samples_num = q_samples.shape[0]

    # quantise
    lin = np.linspace(left, right, min(max(p_samples_num, q_samples_num), MAX_GRID_POINTS))
    p = p_pdf.pdf(lin)
    q = q_pdf.pdf(lin)

    # KL
    kl = min(sc.entropy(p, q), MAX_KL)

    return kl
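The excerpt above depends on module-level names that are not shown (sc, MAX_GRID_POINTS, MAX_KL, _cached_p_pdf). A hedged guess at the surrounding setup, with placeholder values that are assumptions rather than the project's actual constants:

import scipy.stats as sc   # the snippet calls sc.gaussian_kde and sc.entropy

MAX_GRID_POINTS = 1000     # assumed cap on the quantisation grid
MAX_KL = 10.0              # assumed cap on the reported KL value
_cached_p_pdf = {}         # cache of fitted gaussian_kde objects, keyed by idx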
Project: slda    Author: Savvysherpa    | Project source | File source
def check_KL_divergence(topics, results, thresh):
    for res in results:
        minimized_KL = 1
        for topic in topics:
            KL = KL_divergence(topic, res)
            if KL < minimized_KL:
                minimized_KL = KL
        print(minimized_KL)
        assert minimized_KL < thresh
Project: slda    Author: Savvysherpa    | Project source | File source
def check_KL_divergence(topics, results, thresh):
    for res in results:
        minimized_KL = 1
        for topic in topics:
            KL = KL_divergence(topic, res)
            if KL < minimized_KL:
                minimized_KL = KL
        print(minimized_KL)
        assert minimized_KL < thresh
Project: slda    Author: Savvysherpa    | Project source | File source
def JSD(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
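A quick usage note, shown below: with natural-log entropy this Jensen-Shannon divergence lies in [0, ln 2], and it is 0 only when P and Q are identical. P and Q are assumed to be already-normalized numpy arrays of equal length.

import numpy as np
from scipy.stats import entropy

def JSD(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))

P = np.array([0.7, 0.2, 0.1])
Q = np.array([0.1, 0.2, 0.7])
print(JSD(P, P))   # 0.0
print(JSD(P, Q))   # positive, at most ln(2) ~ 0.693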
Project: geomdn    Author: afshinrahimi    | Project source | File source
def get_local_words(preds, vocab, NEs=[], k=50):
    #normalize the probabilities of each vocab
    normalized_preds = normalize(preds, norm='l1', axis=0)
    entropies = stats.entropy(normalized_preds)
    sorted_indices = np.argsort(entropies)
    sorted_local_words = np.array(vocab)[sorted_indices].tolist()
    filtered_local_words = []
    NEset = set(NEs)
    for word in sorted_local_words:
        if word in NEset: continue
        filtered_local_words.append(word)
    return filtered_local_words[0:k]
Project: quoll    Author: LanguageMachines    | Project source | File source
def calculate_entropy(self,labelfracs):
        return stats.entropy(labelfracs)
Project: quoll    Author: LanguageMachines    | Project source | File source
def calculateIG(self,groups,labels):
        # current entropy
        labelfracs = self.obtain_labelfracs(labels)
        current_entropy = self.calculate_entropy(labelfracs)
        # entropy of each grouping
        group_entropy = []
        for group in groups:
            labelfracs = self.obtain_labelfracs(group)
            group_entropy.append((len(group)/len(labels)) * self.calculate_entropy(labelfracs))
        infogain = current_entropy - sum(group_entropy)
        return infogain
Project: NuGridPy    Author: NuGrid    | Project source | File source
def compare_entropy(name_img1,name_img2,method="rmq"):
     '''Compare two images by the Kullback-Leibler divergence

     Parameters
     ----------
     name_img1 : string
       filename of image 1 (png format)

     name_img2 : string
       filename of image 2 (png format)

     Returns
     -------
     S : float
        Kullback-Leibler divergence S = sum(pk * log(pk / qk), axis=0)

     Note
     ----
     See http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
     '''
     img1 = mpimg.imread(name_img1)
     img2 = mpimg.imread(name_img2)
     fimg1 = img1.flatten()
     fimg2 = img2.flatten()
     if method == "KL-div":
          eps = 0.0001
          S = stats.entropy(fimg2+eps,fimg1+eps)
          S = numpy.log10(S)
     elif method == "rmq":
          fdiff=fimg1-fimg2
          fdiff_sqr = fdiff**4
          S = (fdiff_sqr.sum())**(old_div(1.,4))

     return S,fimg1, fimg2
Project: LDA_RecEngine    Author: easonchan1213    | Project source | File source
def KLDivergenceSim(a,b,topics):
    from scipy.stats import entropy
    import math
    a = fill_list_from_dict(a,topics)
    b = fill_list_from_dict(b,topics)
    entropyOf_A_to_B = entropy(a,b)
    entropyOf_B_to_A = entropy(b,a)
    minusSummedEntropy = -(entropyOf_A_to_B+entropyOf_B_to_A)
    return math.exp(minusSummedEntropy)
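The similarity above is the exponential of the negative symmetrized KL divergence, so identical topic distributions score exactly 1 and very different ones decay toward 0. A standalone check with plain arrays (fill_list_from_dict is the project's own helper and is skipped here):

import math
import numpy as np
from scipy.stats import entropy

a = np.array([0.6, 0.3, 0.1])
b = np.array([0.2, 0.5, 0.3])

sym_kl = entropy(a, b) + entropy(b, a)                # symmetrized KL divergence
print(math.exp(-sym_kl))                              # in (0, 1], smaller for more dissimilar topics
print(math.exp(-(entropy(a, a) + entropy(a, a))))     # 1.0 for identical distributions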
Project: CS-SMAF    Author: brian-cleary    | Project source | File source
def print_performance(self,Z,itr,Z_series=(1.,1.)):
        Yhat = self.get_Yhat(Z)
        fit = 1 - np.linalg.norm(self.Y - Yhat)**2/np.linalg.norm(self.Y)**2
        r2 = (1 - distance.correlation(self.Y.flatten(),Yhat.flatten()))**2
        print 'itr: %d, fit: %f, r2: %f, Z entropy: %f, Z min: %f, Z max: %f, Z change: %f' % (itr,fit,r2,
        np.average([np.exp(entropy(abs(z))) for z in Z.T]),Z.min(),Z.max(),
        np.linalg.norm(Z_series[-2] - Z_series[-1])/np.linalg.norm(Z_series[-2]))
        self.fit = (fit,r2)
Project: alp    Author: davefernig    | Project source | File source
def __uncertainty_sampling(self, clf, X_unlabeled):
        probs = clf.predict_proba(X_unlabeled)

        if self.strategy == 'least_confident':
            return 1 - np.amax(probs, axis=1)

        elif self.strategy == 'max_margin':
            margin = np.partition(-probs, 1, axis=1)
            return -np.abs(margin[:,0] - margin[:, 1])

        elif self.strategy == 'entropy':
            return np.apply_along_axis(entropy, 1, probs)
Project: StageDP    Author: EastonWang    | Project source | File source
def entropy(self, features, freq_table):
        """
        """
        feat_vals = {}
        for (feat, idx) in features.items():
            freq = freq_table[idx, :]
            feat_vals[feat] = 1 / (entropy(freq) + 1e-3)
        return feat_vals
Project: StageDP    Author: EastonWang    | Project source | File source
def freq_entropy(self, features, freq_table):
        """
        """
        feat_vals = {}
        feat_freqs = self.frequency(features, freq_table)
        feat_ents = self.entropy(features, freq_table)
        for feat in features.keys():
            freq = feat_freqs[feat]
            ent = feat_ents[feat]
            feat_vals[feat] = numpy.log(freq + 1e-3) * (ent + 1e-3)
        return feat_vals
Project: StageDP    Author: EastonWang    | Project source | File source
def test():
    vocab = {'hello': 0, 'data': 1, 'computer': 2}
    freq_table = [[23, 23, 23, 23], [23, 1, 4, 5], [1, 34, 1, 1]]
    freq_table = numpy.array(freq_table)
    fs = FeatureSelector(topn=2, method='freq-entropy')
    newvocab = fs.select(vocab, freq_table)
    print(newvocab)
Project: CustomerSim    Author: sisl    | Project source | File source
def KL_validate(data_true, data_predicted, n_bins, x_range, n_samples=10000):
    '''"Pr(KL(simulated data||original) > KL(bootstrap original||bootstrap original))'''

    n = data_true.shape[0]

    hist_true, _ = np.histogram(data_true, bins=n_bins, range=x_range)
    hist_predicted, bin_edges = np.histogram(data_predicted, bins=n_bins, range=x_range)

    simulated_KL = sc.entropy(hist_true+1,hist_predicted+1)
    subsampled_KL = []

    for i in xrange(n_samples):
        index1 = np.random.choice(n, n, replace=True)
        index2 = np.random.choice(n, n, replace=True)
        sample1 = data_true[index1]
        sample2 = data_true[index2]
        hist_sample1, _ = np.histogram(sample1, bins=n_bins, range=x_range)
        hist_sample2, _ = np.histogram(sample2, bins=n_bins, range=x_range)
        subsampled_KL.append(sc.entropy(hist_sample2+1,hist_sample1+1))

    subsampled_KL = sorted(subsampled_KL)
    pval = sum( simulated_KL < i for i in subsampled_KL) / float(n_samples)
    conf_interval = (0,subsampled_KL[int(math.ceil(n_samples*0.95))-1])
    return simulated_KL,conf_interval,pval,n

# CONTOUR PLOTS
Project: catchy    Author: jvbalen    | Project source | File source
def first_order(feature, aggregates, verbose=False):
    if not type(aggregates) is list:
        aggregates = [aggregates]
    for aggregate in aggregates:
        if verbose:
            print('        first order computation: ' + aggregate)
        if aggregate == 'log':
            feature = np.log(feature)
        elif aggregate == 'sqrt':
            feature = np.sqrt(feature)
        elif aggregate == 'minlog':
            feature = np.log(1 - feature)
        elif aggregate == 'minsqrt':
            feature = np.sqrt(1 - feature)
        elif aggregate == 'mean':
            # feature = np.mean(feature, axis=0)
            feature = np.nanmean(feature, axis=0)
        elif aggregate == 'var':
            feature = np.var(feature, axis=0)
        elif aggregate == 'std':
            # feature = np.std(feature, axis=0)
            feature = np.nanstd(feature, axis=0)
        elif aggregate == 'stdmean':
            feature = np.hstack([np.mean(feature, axis=0), np.std(feature, axis=0)])
        elif aggregate == 'cov':
            feature = np.cov(feature, rowvar=False).flatten()
        elif aggregate == 'totvar':
            feature = np.array([np.mean(np.var(feature, axis=0))])
        elif aggregate == 'totstd':
            feature = np.array([np.mean(np.std(feature, axis=0))])
        elif aggregate == 'entropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature)])
        elif aggregate == 'normentropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature) / np.log(feature.size)])
        elif aggregate == 'information':
            feature = - np.log(feature)

    return feature
Project: leven-squash    Author: dwcoates    | Project source | File source
def get_entropy(self, probs):
        """
        Estimate the entropy of string in Shannons. That is, this method
        assumes that the frequency of characters in the input string is
        exactly equal to the probability mass function.
        """
        # calculates entropy in Nats
        ent_nat = entropy(probs)

        # convert to Shannons
        ent_shan = ent_nat * 1 / np.log(2)

        return ent_shan
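The nats-to-Shannons conversion above is equivalent to asking scipy for base-2 entropy directly; a small check (probs is assumed to be a normalized frequency vector):

import numpy as np
from scipy.stats import entropy

probs = np.array([0.5, 0.25, 0.25])
print(entropy(probs) / np.log(2))    # 1.5 Shannons (bits), converted from nats
print(entropy(probs, base=2))        # 1.5, same value computed directly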
Project: nli_generation    Author: jstarc    | Project source | File source
def diversity(dev, gen_test, beam_size, hypo_len, noise_size, per_premise, samples):
    step = len(dev[0]) / samples
    sind = [i * step for i in range(samples)]
    p = Progbar(per_premise * samples)
    for i in sind:
        hypos = []
        unique_words = []
        hypo_list = []
        premise = dev[0][i]
        prem_list = set(cut_zeros(list(premise)))        
        while len(hypos) < per_premise:
            label = np.argmax(dev[2][i])
            words = single_generate(premise, label, gen_test, beam_size, hypo_len, noise_size)
            hypos += [str(ex) for ex in words]
            unique_words += [int(w) for ex in words for w in ex if w > 0]
            hypo_list += [set(cut_zeros(list(ex))) for ex in words]

        jacks = []  
        prem_jacks = []
        for u in range(len(hypo_list)):
            sim_prem = len(hypo_list[u] & prem_list)/float(len(hypo_list[u] | prem_list))
            prem_jacks.append(sim_prem)
            for v in range(u+1, len(hypo_list)):
                sim = len(hypo_list[u] & hypo_list[v])/float(len(hypo_list[u] | hypo_list[v]))
                jacks.append(sim)
        avg_dist_hypo = 1 -  np.mean(jacks)
        avg_dist_prem = 1 -  np.mean(prem_jacks)
        d = entropy(Counter(hypos).values()) 
        w = entropy(Counter(unique_words).values())
        p.add(len(hypos), [('diversity', d),('word_entropy', w),('avg_dist_hypo', avg_dist_hypo), ('avg_dist_prem', avg_dist_prem)])
    arrd = p.sum_values['diversity']
    arrw = p.sum_values['word_entropy']
    arrj = p.sum_values['avg_dist_hypo']
    arrp = p.sum_values['avg_dist_prem']

    return arrd[0] / arrd[1], arrw[0] / arrw[1], arrj[0] / arrj[1],  arrp[0] / arrp[1]
Project: twitter_LDA_topic_modeling    Author: kenneth-orton    | Project source | File source
def jensen_shannon(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
Project: twitter_LDA_topic_modeling    Author: kenneth-orton    | Project source | File source
def jensen_shannon_divergence(P, Q):
    _P = np.array(P) / norm(np.array(P), ord=1)
    _Q = np.array(Q) / norm(np.array(Q), ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
Project: CElegansBehaviour    Author: ChristophKirst    | Project source | File source
def jensen_shannon_divergence(p,q):
  """Jensen-Shannon distance between distributions p and q"""
  m = (p+q)/2.0;
  return stats.entropy(p,m) + stats.entropy(q,m);
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | Project source | File source
def mutual_info(x, y, bins=10):
    counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
    counts_x, bins = np.histogram(x, bins=bins)
    counts_y, bins = np.histogram(y, bins=bins)

    counts_xy += 1
    counts_x += 1
    counts_y += 1
    P_xy = counts_xy / np.sum(counts_xy, dtype=float)
    P_x = counts_x / np.sum(counts_x, dtype=float)
    P_y = counts_y / np.sum(counts_y, dtype=float)

    I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))

    return I_xy / (entropy(counts_x) + entropy(counts_y))
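A hedged usage sketch, assuming the mutual_info function above is in scope: the +1 Laplace smoothing keeps all histogram cells non-zero, and dividing I(X;Y) by H(X) + H(Y) normalizes the score, so strongly dependent variables score far higher than independent ones.

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=5000)
y_dep = 2 * x + 0.01 * rng.normal(size=5000)   # nearly deterministic function of x
y_ind = rng.normal(size=5000)                  # independent of x

print(mutual_info(x, y_dep))   # substantially larger (approaching 0.5)
print(mutual_info(x, y_ind))   # close to 0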
Project: mlprojects-py    Author: srinathperera    | Project source | File source
def create_window_based_features(data, window_size):
    central_fn = np.mean
    ma1 = calcuate_window_operation(data, window_size, central_fn)
    ma2 = calcuate_window_operation(data, 2 * window_size, central_fn)
    ma4 = calcuate_window_operation(data, 4 * window_size, central_fn)
    ma8 = calcuate_window_operation(data, 8 * window_size, central_fn)


    entropy = calcuate_window_operation(data, window_size, stats.entropy)
    stddev = calcuate_window_operation(data, window_size, np.std)
    medain_weeksbefore = value_before_period(data, 7)

    return np.column_stack((ma1, ma2, ma4, ma8, entropy, stddev, medain_weeksbefore))

# do cross volidation http://stackoverflow.com/questions/533905/get-the-cartesian-product-of-a-series-of-lists-in-python
Project: scikit-gstat    Author: mmaelicke    | Project source | File source
def entropy(X, bins=None):
    """
    Use the Shannon entropy H to describe the distribution of the given sample.
    For calculating the Shannon entropy, the bin edges are needed and can be passed as bins.
    If bins is None, the edges will be calculated using the numpy.histogram function with bins='fd'.
    This uses the Freedman-Diaconis estimator and is fairly resilient to outliers.
    If the input data X is 2D (entropy is needed for more than one bin), the histogram is derived once and
    the same edges are used for all bins.
    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.4

    :param X: np.ndarray with the given sample to calculate the Shannon entropy from
    :param bins: the bin edges for entropy calculation, or a number of evenly spaced bins
    :return:
    """
    _X = np.array(X)

    if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
        # if bins is not set, use the histogram over the full value range
        if bins is None:
            # could not figure out a better way here. I need the values before calculating the entropy
            # in order to use the full value range in all bins
            vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
            bins = np.histogram(vals, bins=15)[1][1:]
        return np.array([entropy(_, bins=bins) for _ in _X])

    # check even
    if len(_X) % 2 > 0:
        raise ValueError('The sample does not have an even length: {}'.format(_X))

    # calculate the values
    vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]

    # calculate the bins
    if bins is None:
        bins = 15
    pk = np.histogram(vals, bins)[0]

    return scipy_entropy(pk=pk)
Project: nonce2vec    Author: minimalparts    | Project source | File source
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a non-negative distance value; values closer to 0 mean less distance (and a higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix 
    if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense
        if num_features != None: # if not None, make as large as the documents drawing from
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
        # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
Project: EvadeML-Zoo    Author: mzweilin    | Project source | File source
def kl(x1, x2):
    assert x1.shape == x2.shape
    # x1_2d, x2_2d = reshape_2d(x1), reshape_2d(x2)

    # Transpose to [?, #num_examples]
    x1_2d_t = x1.transpose()
    x2_2d_t = x2.transpose()

    # pdb.set_trace()
    e = entropy(x1_2d_t, x2_2d_t)
    e[np.where(e==np.inf)] = 2
    return e
Project: MLAlgorithms    Author: rushter    | Project source | File source
def f_entropy(p):
    # Convert values to probability
    p = np.bincount(p) / float(p.shape[0])

    ep = stats.entropy(p)
    if ep == -float('inf'):
        return 0.0
    return ep
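A small usage sketch (labels are assumed to be non-negative integer class ids, as np.bincount requires):

import numpy as np
from scipy import stats

y = np.array([0, 0, 1, 1, 1])
p = np.bincount(y) / float(y.shape[0])   # [0.4, 0.6]
print(stats.entropy(p))                  # ~0.673 nats; 0.0 for a pure node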
Project: xam    Author: MaxHalford    | Project source | File source
def calc_class_entropy(y):
    class_counts = stats.itemfreq(y)[:, 1]
    return stats.entropy(class_counts, base=2)
Project: ohmnet    Author: marinkaz    | Project source | File source
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a non-negative distance value; values closer to 0 mean less distance (and a higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix 
    if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense
        if num_features != None: # if not None, make as large as the documents drawing from
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
        # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
Project: neurotools    Author: michaelerule    | Project source | File source
def barHhv(s):
        '''
        Conditional entropy H(h|v)
        '''
        return np.sum(s.Qv*s.Hhv)
Project: neurotools    Author: michaelerule    | Project source | File source
def barHvh(s):
        '''
        Conditional entropy H(v|h)
        '''
        return np.sum(s.Qh*s.Hvh)

    # ---------------------------------------------------------------------
    # Energies of samples
Project: neurotools    Author: michaelerule    | Project source | File source
def short_report(s):
        Hhs = np.sum(rb.bitent(s.Ph))
        Hvs = np.sum(rb.bitent(s.Pv))
        # print a short report
        print('\nRBM dataset Ns=%s Nh=%s Nv=%s'%(s.Ns,s.Nh,s.Nv))
        print('Vis capacity, maximum',np.sum(rb.bitent(0.5*np.ones(s.Nv))))
        print('Hid capacity, maximum',np.sum(rb.bitent(0.5*np.ones(s.Nh))))
        print('Vis entropy , sampled',Hvs)
        print('Hid entropy , sampled',Hhs)
        print('Entropy difference   ',(Hhs-Hvs))
        print('Mean hidden rate     ',np.mean(s.Ph))
        print('Mean hidde complexity',rb.bitent(np.mean(s.Ph))*s.Nh)
Project: neurotools    Author: michaelerule    | Project source | File source
def long_report(s):
        lgE = np.log2(np.e)
        # Long report
        # print('\nFound dataset %s T=%s Nh=%s Nv=%s'%(DIR,T,Nh,Nv))
        # print('DKL                   %0.2f'%DKL)
        print('\nRBM dataset Ns=%s Nh=%s Nv=%s'%(s.Ns,s.Nh,s.Nv))
        # Hidden layer entropy
        print('==Hidden layer entropy==')
        print('Hid capacity, maximum %0.2f'%(np.sum(rb.bitent(0.5*np.ones(s.Nh)))))
        print('Hid entropy , sampled %0.2f'%(s.Hhs))
        print('Entropy hid sample is %0.2f'%(entropy(s.Qh,base=2)))
        print('<<Eh>h|v>v sampled is %0.2f'%(s.barEhhv*lgE))
        print('<<Eh>h|v>v ufield  is %0.2f'%(s.barEhhv_meanfield*lgE))
        print('Mean hidde complexity %0.2f'%(rb.bitent(np.mean(s.Ph))*s.Nh))
        print('Mean hidden rate      %0.2f'%(np.mean(s.Ph)))
        # Conditional entropy
        print('==Conditional entropy==')
        print('Entropy difference    %0.2f'%(s.Hhs-s.Hvs))
        print('<H_h|v>v           is %0.2f'%(s.barHhv*lgE))
        # Likelihoods
        print('==Negative log-likelihood==')
        print('<<Ev|h>h|v>v sampl is %0.2f'%(s.barEvhhv *lgE))
        print('<<Ev|h>h|v>v ufild is %0.2f'%(s.barEvhhv_meanfield*lgE))
        # KL divergences
        print('==KL divergences==')
        print('<Dkl(h|v||h)>v sam is %0.2f'%(s.barDKLhv*lgE))
        print('<Dkl(h|v||h)>v uf1 is %0.2f'%(s.barDKLhv_meanfield*lgE))
        # Visible entropy; These should be close in value
        print('==Visible layer entropy==')
        print('Vis capacity, maximum %0.2f'%(np.sum(rb.bitent(0.5*np.ones(s.Nv)))))
        print('Vis entropy , sampled %0.2f'%(s.Hvs))
        print('Entropy vis sample is %0.2f'%(entropy(s.Qv,base=2)))
        print('<D(.)+<Ev|h>h|v>v sam %0.2f'%(s.barDKLhv*lgE+s.barEvhhv *lgE))
        print('<D(.)+<Ev|h>h|v>v uf1 %0.2f'%(s.barDKLhv_meanfield*lgE+s.barEvhhv_meanfield*lgE))
Project: Database-Generation-for-Itemset-Mining    Author: clezcano    | Project source | File source
def entropy(self, filename, delimeter, itemsetSize, minsup, fun): # fun defines use of build-in entropy or my own
        db = DataBase()
        db.readDB(filename, delimeter)
        dbElem = db.getDBElements()
        dbSize = db.size()

        kItemsetFreq = [float(db.getItemsetSup(set(itemset))) / dbSize for itemset in combinations(dbElem, itemsetSize)]
        sumFreq = sum(kItemsetFreq)
        kItemsetProb = [itemsetFreq / sumFreq for itemsetFreq in kItemsetFreq]  # materialize before kItemsetFreq is cleared
        kItemsetFreq.clear()
        db.getDataBase().clear()
        if fun == 1:
            return entropy(kItemsetProb, base=2)
        elif fun == 2:
            return self.calculateEntropy(kItemsetProb)