Is there a way to use the numpy.percentile function to compute a weighted percentile? Or is anyone aware of an alternative Python function for computing weighted percentiles?
Unfortunately, numpy doesn't have built-in weighted functions for everything, but you can always put something together.
```python
import numpy as np

def weight_array(ar, weights):
    zipped = zip(ar, weights)
    weighted = []
    # repeat each value according to its (integer) weight
    for value, weight in zipped:
        for _ in range(weight):
            weighted.append(value)
    return weighted

np.percentile(weight_array(ar, weights), 25)
```
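For example, with some hypothetical sample data (integer weights, since the values are literally repeated):

```python
import numpy as np

# hypothetical sample data: values with integer counts as weights
ar = [1, 2, 9, 3.2, 4]
weights = [2, 1, 3, 4, 1]

# each value is repeated `weight` times, then an ordinary percentile
# is taken over the expanded list
print(np.percentile(weight_array(ar, weights), 25))  # 2.6
```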
Completely vectorized numpy solution
Here is the code I'm using. It's not optimal (I wasn't able to write it purely in numpy), but it's still much faster and more reliable than the accepted solution.
```python
import numpy as np

def weighted_quantile(values, quantiles, sample_weight=None,
                      values_sorted=False, old_style=False):
    """ Very close to numpy.percentile, but supports weights.
    NOTE: quantiles should be in [0, 1]!
    :param values: numpy.array with data
    :param quantiles: array-like with many quantiles needed
    :param sample_weight: array-like of the same length as `array`
    :param values_sorted: bool, if True, then will avoid sorting of
        initial array
    :param old_style: if True, will correct output to be consistent
        with numpy.percentile.
    :return: numpy.array with computed quantiles.
    """
    values = np.array(values)
    quantiles = np.array(quantiles)
    if sample_weight is None:
        sample_weight = np.ones(len(values))
    sample_weight = np.array(sample_weight)
    assert np.all(quantiles >= 0) and np.all(quantiles <= 1), \
        'quantiles should be in [0, 1]'

    if not values_sorted:
        sorter = np.argsort(values)
        values = values[sorter]
        sample_weight = sample_weight[sorter]

    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
    if old_style:
        # To be consistent with numpy.percentile
        weighted_quantiles -= weighted_quantiles[0]
        weighted_quantiles /= weighted_quantiles[-1]
    else:
        weighted_quantiles /= np.sum(sample_weight)
    return np.interp(quantiles, weighted_quantiles, values)
```
```python
weighted_quantile([1, 2, 9, 3.2, 4], [0.0, 0.5, 1.])
# array([ 1. ,  3.2,  9. ])

weighted_quantile([1, 2, 9, 3.2, 4], [0.0, 0.5, 1.], sample_weight=[2, 1, 2, 4, 1])
# array([ 1. ,  3.2,  9. ])
```
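As a quick sanity check (my addition, not part of the original answer): with unit weights and old_style=True, the output should line up with np.percentile:

```python
import numpy as np

data = [1, 2, 9, 3.2, 4]

# with unit weights, old_style=True reproduces numpy's own interpolation
print(weighted_quantile(data, [0.25, 0.5, 0.75], old_style=True))
# [2.  3.2 4. ]
print(np.percentile(data, [25, 50, 75]))
# [2.  3.2 4. ]
```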
A quick solution, by first sorting and then interpolating:
```python
import numpy as np

def weighted_percentile(data, percents, weights=None):
    """percents in units of 1%
    weights specifies the frequency (count) of data.
    """
    if weights is None:
        return np.percentile(data, percents)
    ind = np.argsort(data)
    d = data[ind]
    w = weights[ind]
    # weighted CDF in percent units
    p = 1. * w.cumsum() / w.sum() * 100
    y = np.interp(percents, p, d)
    return y
```
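A minimal usage sketch (the sample numbers are mine):

```python
import numpy as np

data = np.array([1.0, 2.0, 3.0])
weights = np.array([1.0, 2.0, 3.0])

# median under the interpolated weighted CDF
print(weighted_percentile(data, 50, weights))        # 2.0
print(weighted_percentile(data, [25, 75], weights))  # [1.25 2.5 ]
```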
Apologies for the additional (unoriginal) answer (not enough rep to comment on @nayyarv's). His solution worked for me (i.e. it replicates the default behavior of np.percentile), but I think you can eliminate the for loop with clues from how the original np.percentile is written.
```python
import numpy as np

def weighted_percentile(a, q=np.array([75, 25]), w=None):
    """
    Calculates percentiles associated with a (possibly weighted) array

    Parameters
    ----------
    a : array-like
        The input array from which to calculate percents
    q : array-like
        The percentiles to calculate (0.0 - 100.0)
    w : array-like, optional
        The weights to assign to values of a. Equal weighting if None
        is specified

    Returns
    -------
    values : np.array
        The values associated with the specified percentiles.
    """
    # Standardize and sort based on values in a
    q = np.array(q) / 100.0
    if w is None:
        w = np.ones(a.size)
    idx = np.argsort(a)
    a_sort = a[idx]
    w_sort = w[idx]

    # Get the cumulative sum of weights
    ecdf = np.cumsum(w_sort)

    # Find the percentile index positions associated with the percentiles
    p = q * (w.sum() - 1)

    # Find the bounding indices (both low and high)
    idx_low = np.searchsorted(ecdf, p, side='right')
    idx_high = np.searchsorted(ecdf, p + 1, side='right')
    idx_high[idx_high > ecdf.size - 1] = ecdf.size - 1

    # Calculate the weights
    weights_high = p - np.floor(p)
    weights_low = 1.0 - weights_high

    # Extract the low/high indexes and multiply by the corresponding weights
    x1 = np.take(a_sort, idx_low) * weights_low
    x2 = np.take(a_sort, idx_high) * weights_high

    # Return the average
    return np.add(x1, x2)

# Sample data
a = np.array([1.0, 2.0, 9.0, 3.2, 4.0])
w = np.array([2.0, 1.0, 3.0, 4.0, 1.0])

# Make an unweighted "copy" of a for testing
a2 = np.repeat(a, w.astype(int))

# Tests with different percentiles chosen
q1 = np.linspace(0.0, 100.0, 11)
q2 = np.linspace(5.0, 95.0, 10)
q3 = np.linspace(4.0, 94.0, 10)

for q in (q1, q2, q3):
    assert np.all(weighted_percentile(a, q, w) == np.percentile(a2, q))
```
I don't know what "weighted percentile" means, but from @Joan Smith's answer, it seems that you just need to repeat every element in ar. You can use:
```python
import numpy as np

np.repeat([1, 2, 3], [4, 5, 6])
```

The result is:

```
array([1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3])
```
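Assuming the weights are integer counts, the weighted percentile from the question then reduces to a single expression (example numbers are mine):

```python
import numpy as np

ar = np.array([1, 2, 3])
weights = np.array([4, 5, 6])

# expand each value by its count, then take an ordinary percentile
print(np.percentile(np.repeat(ar, weights), 25))  # 1.5
```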
I use this function for my needs:
```python
import numpy

def quantile_at_values(values, population, weights=None):
    values = numpy.atleast_1d(values).astype(float)
    population = numpy.atleast_1d(population).astype(float)

    # if no weights are given, use equal weights
    if weights is None:
        weights = numpy.ones(population.shape).astype(float)
        normal = float(len(weights))
    # else, check weights
    else:
        weights = numpy.atleast_1d(weights).astype(float)
        assert len(weights) == len(population)
        assert (weights >= 0).all()
        normal = numpy.sum(weights)
        assert normal > 0.

    # fraction of total weight at or below each queried value
    quantiles = numpy.array([numpy.sum(weights[population <= value])
                             for value in values]) / normal
    assert (quantiles >= 0).all() and (quantiles <= 1).all()
    return quantiles
- It is vectorized as far as I could go.
- It has a lot of sanity checks.
- It works with floats as weights.
- It can work without weights (→ equal weights).
- It can compute multiple quantiles at once.
Multiply results by 100 if you want percentiles instead of quantiles.
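A minimal usage sketch (sample numbers are mine). Note that this runs in the opposite direction from np.percentile: you pass in values and get back their quantile ranks:

```python
import numpy

population = numpy.array([1.0, 2.0, 3.0, 4.0])
weights = numpy.array([1.0, 1.0, 1.0, 1.0])

# fraction of total weight at or below each queried value
print(quantile_at_values([2.0, 3.5], population, weights))  # [0.5  0.75]
```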
As mentioned in the comments, simply repeating values is impossible for float weights, and impractical for very large datasets. There is a library that does weighted percentiles here: http://kochanski.org/gpk/code/speechresearch/gmisclib/gmisclib.weighted_percentile-module.html It worked for me.
```python
import numpy as np

def weighted_percentile(a, percentile=np.array([75, 25]), weights=None):
    """
    O(nlgn) implementation for weighted_percentile.
    """
    percentile = np.array(percentile) / 100.0
    if weights is None:
        weights = np.ones(len(a))

    a_indsort = np.argsort(a)
    a_sort = a[a_indsort]
    weights_sort = weights[a_indsort]
    ecdf = np.cumsum(weights_sort)

    # need the 1 offset at the end due to ecdf not starting at 0
    percentile_index_positions = percentile * (weights.sum() - 1) + 1
    locations = np.searchsorted(ecdf, percentile_index_positions)

    out_percentiles = np.zeros(len(percentile_index_positions))

    # iterate across the requested percentiles
    for i, empiricalLocation in enumerate(locations):
        if ecdf[empiricalLocation - 1] == np.floor(percentile_index_positions[i]):
            # i.e. the percentile lies in between 2 separate values
            uppWeight = percentile_index_positions[i] - ecdf[empiricalLocation - 1]
            lowWeight = 1 - uppWeight
            out_percentiles[i] = a_sort[empiricalLocation - 1] * lowWeight + \
                                 a_sort[empiricalLocation] * uppWeight
        else:
            # i.e. the percentile is entirely in one bin
            out_percentiles[i] = a_sort[empiricalLocation]

    return out_percentiles
```
This is my function; it gives behaviour identical to

```python
np.percentile(np.repeat(a, weights), percentile)
```

but with less memory overhead. np.percentile is an O(n) implementation, so it's potentially faster for small weights. It has all the edge cases sorted out: it's an exact solution. The interpolating answers above assume a linear CDF, when it is actually a step function in most cases (except where a weight is 1).
Say we have data [1, 2, 3] with weights [3, 7, 11] and I want the 25th percentile. The ecdf is going to be [3, 10, 21] and I'm looking for the 5th value. Interpolation will see (3, 1) and (10, 2) as the bracketing points and interpolate, giving about 1.29, despite the position lying entirely in the 2nd bin with a value of 2.
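That example can be checked directly. A small sketch (mine) contrasting the exact repeat-based answer with naive interpolation over the ecdf:

```python
import numpy as np

data = np.array([1, 2, 3])
weights = np.array([3, 7, 11])

# exact, repeat-based definition: the 25th percentile is plainly 2
print(np.percentile(np.repeat(data, weights), 25))  # 2.0

# naive interpolation over the ecdf [3, 10, 21] at position 5
print(np.interp(0.25 * (weights.sum() - 1), np.cumsum(weights), data))  # ~1.29
```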
Here is my solution:
```python
import numpy as np

def my_weighted_perc(data, perc, weights=None):
    if weights is None:
        return np.nanpercentile(data, perc)
    else:
        # keep only pairs where both the value and its weight are valid
        mask = (~np.isnan(data)) & (~np.isnan(weights))
        d = data[mask]
        wei = weights[mask]
        ix = np.argsort(d)
        d = d[ix]
        wei = wei[ix]
        # weighted CDF in percent units
        wei_cum = 100. * np.cumsum(wei) / np.sum(wei)
        return np.interp(perc, wei_cum, d)
```
It simply calculates the weighted CDF of the data and then uses it to estimate the weighted percentiles.
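A minimal usage sketch (the data is mine), showing the NaN handling and the unweighted fallback:

```python
import numpy as np

# hypothetical data containing a NaN, which is dropped before the CDF is built
data = np.array([1.0, 2.0, np.nan, 3.0])
weights = np.array([1.0, 2.0, 1.0, 3.0])

print(my_weighted_perc(data, 50, weights))  # 2.0
print(my_weighted_perc(data, 50))           # 2.0, via nanpercentile
```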