浏览代码

r.quantile, r.stats.quantile, lib/stats: fix quantile algorithm (#2108)

* use type 7 algorithm of Hyndman and Fan (1996) for quantiles, as is the default in R and numpy 
 * update manuals for `r.quantile`, `r.stats.quantile`
 * sync `r.stats quantile` to `r.quantile`
 * update test results for `r.neighbors`
Markus Metz 3 年之前
父节点
当前提交
c700a8a923

+ 2 - 1
lib/stats/c_percentile.c

@@ -17,7 +17,8 @@ void c_quant(DCELL * result, DCELL * values, int n, const void *closure)
 	return;
     }
 
-    k = n * quant;
+    /* algorithm type 7 of Hyndman and Fan (1996), default in R */
+    k = quant * (n - 1);
     i0 = (int)floor(k);
     i1 = (int)ceil(k);
 

+ 49 - 49
raster/r.neighbors/testsuite/test_r_neighbors.py

@@ -41,18 +41,18 @@ class TestNeighbors(TestCase):
                 "sum": 221701132.328964,
             },
             "perc90": {
-                "n": 2022154,
-                "null_cells": 2846,
+                "n": 2025000,
+                "null_cells": 0,
                 "cells": 2025000,
-                "min": 50.5460597991944,
-                "max": 153.0166015625,
-                "range": 102.470541763306,
-                "mean": 100.145484105243,
-                "mean_of_abs": 100.145484105243,
-                "stddev": 18.2647774894326,
-                "variance": 333.602096738483,
-                "coeff_var": 18.2382437437099,
-                "sum": 202509591.265354,
+                "min": 56.0450115203857,
+                "max": 156.317572021484,
+                "range": 100.272560501099,
+                "mean": 111.060386339868,
+                "mean_of_abs": 111.060386339868,
+                "stddev": 20.301968270214,
+                "variance": 412.169915644776,
+                "coeff_var": 18.2801167358501,
+                "sum": 224897282.338233,
             },
             "average": {
                 "n": 2025000,
@@ -116,43 +116,43 @@ class TestNeighbors(TestCase):
                 "n": 2025000,
                 "null_cells": 0,
                 "cells": 2025000,
-                "min": 55.5858826446533,
-                "max": 156.204046478271,
-                "range": 100.618163833618,
-                "mean": 109.505643774445,
-                "mean_of_abs": 109.505643774445,
-                "stddev": 20.3076172385227,
-                "variance": 412.399317906343,
-                "coeff_var": 18.5448133434578,
-                "sum": 221748928.643252,
+                "min": 55.5847009658813,
+                "max": 156.203791503906,
+                "range": 100.619090538025,
+                "mean": 109.503018547944,
+                "mean_of_abs": 109.503018547944,
+                "stddev": 20.3076172385226,
+                "variance": 412.398941046239,
+                "coeff_var": 18.5452494634586,
+                "sum": 221743612.559588,
             },
             0.03: {
                 "n": 2025000,
                 "null_cells": 0,
                 "cells": 2025000,
-                "min": 55.600062789917,
-                "max": 156.208636016846,
-                "range": 100.608573226929,
-                "mean": 109.552850010781,
-                "mean_of_abs": 109.552850010781,
-                "stddev": 20.3078616792495,
-                "variance": 412.409245983529,
-                "coeff_var": 18.5370455239192,
-                "sum": 221844521.271832,
+                "min": 55.5965177536011,
+                "max": 156.20787109375,
+                "range": 100.611353340149,
+                "mean": 109.544974331288,
+                "mean_of_abs": 109.544974331288,
+                "stddev": 20.3078125947647,
+                "variance": 412.407252384084,
+                "coeff_var": 18.5383334276472,
+                "sum": 221828573.020859,
             },
             0.95: {
-                "n": 2022154,
-                "null_cells": 2846,
+                "n": 2025000,
+                "null_cells": 0,
                 "cells": 2025000,
-                "min": 25.2730298995972,
-                "max": 153.0166015625,
-                "range": 127.743571662903,
-                "mean": 50.141999640313,
-                "mean_of_abs": 50.141999640313,
-                "stddev": 9.33982798423495,
-                "variance": 87.2323867750984,
-                "coeff_var": 18.6267561150991,
-                "sum": 101394845.140658,
+                "min": 56.0488119125366,
+                "max": 156.323718261719,
+                "range": 100.274906349182,
+                "mean": 111.161659406447,
+                "mean_of_abs": 111.161659406447,
+                "stddev": 20.3000320258108,
+                "variance": 412.091300248942,
+                "coeff_var": 18.2617209334619,
+                "sum": 225102360.298055,
             },
         },
         "test_standard_options_circular": {
@@ -174,15 +174,15 @@ class TestNeighbors(TestCase):
                 "n": 2025000,
                 "null_cells": 0,
                 "cells": 2025000,
-                "min": 56.4984405517578,
-                "max": 156.306396484375,
-                "range": 99.8079559326172,
-                "mean": 111.82648180616,
-                "mean_of_abs": 111.82648180616,
-                "stddev": 20.2829112001524,
-                "variance": 411.396486753269,
-                "coeff_var": 18.1378425508466,
-                "sum": 226448625.657474,
+                "min": 56.2710201263428,
+                "max": 156.299404907227,
+                "range": 100.028384780884,
+                "mean": 111.688409057181,
+                "mean_of_abs": 111.688409057181,
+                "stddev": 20.286687968792,
+                "variance": 411.54970874313,
+                "coeff_var": 18.1636466487815,
+                "sum": 226169028.340791,
             },
             "average": {
                 "n": 2025000,

+ 73 - 26
raster/r.quantile/main.c

@@ -18,7 +18,6 @@
 #include <grass/raster.h>
 #include <grass/glocale.h>
 
-/* TODO: replace long with either size_t or a guaranteed 64 bit integer */
 struct bin
 {
     size_t origin;
@@ -34,11 +33,11 @@ static DCELL *quants;
 static int num_slots;
 static size_t *slots;
 static DCELL slot_size;
-/* total should be a 64bit integer */
-static unsigned long total;
+static grass_int64 total;
 static size_t num_values;
 static unsigned short *slot_bins;
-static int num_bins;
+static int num_bins_alloc;
+static int num_bins_used;
 static struct bin *bins;
 static DCELL *values;
 
@@ -53,12 +52,38 @@ static inline int get_slot(DCELL c)
     return i;
 }
 
+/* get zero-based rank for quantile */
+/* generic formula for one-based rank
+ * rank = quant * (N + 1 - 2C) + C
+ * with quant = quantile, N = number of values, C = constant
+ * common values for C:
+ * C = 0
+ *   rank = quant * (N + 1)
+ *   recommended by NIST (National Institute of Standards and Technology)
+ *   https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
+ * C = 0.5
+ *   rank = quant * N + 0.5
+ *   Matlab
+ * C = 1
+ *   rank = quant * (N - 1) + 1
+ *   numpy, R, MS Excel, ...
+ *   Noted as an alternative by NIST */
 static inline double get_quantile(int n)
 {
-    if (n >= num_quants)
+    double rnk;
+
+    if (n >= num_quants) {
+	/* stop condition for initialize_bins() */
 	return (double)total + total;
+    }
 
-    return (double)total * quants[n];
+    rnk = quants[n] * (total - 1);
+    if (rnk < 0)
+	rnk = 0;
+    if (rnk > total - 1)
+	rnk = total - 1;
+
+    return rnk;
 }
 
 static void get_slot_counts(int infile)
@@ -99,18 +124,26 @@ static void initialize_bins(void)
     int bin = 0;
     size_t accum = 0;
     int quant = 0;
+    int use_next_slot = 0;
 
     G_message(_("Computing bins"));
 
     num_values = 0;
     next = get_quantile(quant);
 
+    /* for a given quantile, two bins might be needed
+     * if the index for this quantile is
+     * > accumulated count of current bin
+     * and
+     * < accumulated count of next bin */
+
     for (slot = 0; slot < num_slots; slot++) {
 	size_t count = slots[slot];
 	size_t accum2 = accum + count;
 
-	if (accum2 > next ||
-	    (slot == num_slots - 1 && accum2 == next)) {
+	if (count > 0 &&
+	    (accum2 > next || use_next_slot) &&
+	    bin < num_bins_alloc) {
 	    struct bin *b = &bins[bin];
 
 	    slot_bins[slot] = ++bin;
@@ -121,18 +154,24 @@ static void initialize_bins(void)
 	    b->min = min + slot_size * slot;
 	    b->max = min + slot_size * (slot + 1);
 
-	    while (accum2 > next)
-		next = get_quantile(++quant);
+	    use_next_slot = 0;
+
+	    if (accum2 - next < 1) {
+		use_next_slot = 1;
+	    }
+	    else {
+		while (accum2 > next)
+		    next = get_quantile(++quant);
+	    }
 
 	    num_values += count;
 	}
-
 	accum = accum2;
     }
 
-    num_bins = bin;
+    num_bins_used = bin;
 
-    G_debug(1, "Number of bins: %d", num_bins);
+    G_debug(1, "Number of used bins: %d", num_bins_used);
     G_debug(1, "Number of values: %lu", num_values);
 }
 
@@ -188,7 +227,7 @@ static void sort_bins(void)
 
     G_message(_("Sorting bins"));
 
-    for (bin = 0; bin < num_bins; bin++) {
+    for (bin = 0; bin < num_bins_used; bin++) {
 	struct bin *b = &bins[bin];
 
 	qsort(&values[b->base], b->count, sizeof(DCELL), compare_dcell);
@@ -209,22 +248,17 @@ static void compute_quantiles(int recode)
 	double k, v;
 	size_t i0, i1;
 
-	for (; bin < num_bins; bin++) {
+	for (; bin < num_bins_used; bin++) {
 	    b = &bins[bin];
 	    if (b->origin + b->count >= next)
 		break;
 	}
 
-	if (bin < num_bins) {
+	if (bin < num_bins_used) {
 	    k = next - b->origin;
 	    i0 = (size_t)floor(k);
 	    i1 = (size_t)ceil(k);
 
-	    if (i0 > b->count - 1)
-		i0 = b->count - 1;
-	    if (i1 > b->count - 1)
-		i1 = b->count - 1;
-
 	    v = (i0 == i1)
 		? values[b->base + i0]
 		: values[b->base + i0] * (i1 - k) +
@@ -258,6 +292,7 @@ int main(int argc, char *argv[])
     int recode;
     int infile;
     struct FPRange range;
+    int num_slots_max;
 
     G_gisinit(argv[0]);
 
@@ -301,7 +336,7 @@ int main(int argc, char *argv[])
     flag.r = G_define_flag();
     flag.r->key = 'r';
     flag.r->description = _("Generate recode rules based on quantile-defined intervals");
- 
+
     if (G_parser(argc, argv))
 	exit(EXIT_FAILURE);
 
@@ -341,17 +376,29 @@ int main(int argc, char *argv[])
     Rast_read_fp_range(opt.input->answer, "", &range);
     Rast_get_fp_range_min_max(&range, &min, &max);
 
+    rows = Rast_window_rows();
+    cols = Rast_window_cols();
+
+    /* minimum 1000 values per slot to reduce memory consumption */
+    num_slots_max = ((size_t)rows * cols) / 1000;
+    if (num_slots_max < 1)
+	num_slots_max = 1;
+    if (num_slots > num_slots_max) {
+	G_message(_("Reducing number of bins from %d to %d"),
+	            num_slots, num_slots_max);
+	num_slots = num_slots_max;
+    }
+
     slots = G_calloc(num_slots, sizeof(size_t));
     slot_bins = G_calloc(num_slots, sizeof(unsigned short));
 
     slot_size = (max - min) / num_slots;
 
-    rows = Rast_window_rows();
-    cols = Rast_window_cols();
-
     get_slot_counts(infile);
 
-    bins = G_calloc(num_quants, sizeof(struct bin));
+    /* sometimes two bins are needed to calculate a quantile */
+    num_bins_alloc = num_quants * 2;
+    bins = G_calloc(num_bins_alloc, sizeof(struct bin));
     initialize_bins();
     G_free(slots);
 

+ 19 - 2
raster/r.quantile/r.quantile.html

@@ -3,6 +3,11 @@
 <em>r.quantile</em> computes quantiles in a manner suitable
 for use with large amounts of data. It is using two passes.
 
+<h2>NOTES</h2>
+
+Quantiles are calculated following algorithm 7 from Hyndman and Fan (1996),
+which is also the default in R and numpy.
+
 <h2>EXAMPLE</h2>
 
 Calculation of elevation quantiles (printed to standard-out):
@@ -20,6 +25,18 @@ r.quantile elevation quantiles=5 -r --quiet | r.recode elevation \
            out=elev_quant5 rules=-
 </pre></div>
 
+<h2>REFERENCES</h2>
+
+<ul>
+<li>Hyndman and Fan (1996) <i>Sample Quantiles in Statistical 
+Packages</i>, <b>American Statistician</b>. American Statistical 
+Association. 50 (4): 361-365. DOI: <a 
+href="https://doi.org/10.2307/2684934>10.2307/2684934">10.2307/2684934</a></li>
+<li><a 
+href="https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm"><i>Engineering 
+Statistics Handbook: Percentile</i></a>, NIST</li>
+</ul>
+
 <h2>SEE ALSO</h2>
 
 <em>
@@ -35,10 +52,10 @@ r.quantile elevation quantiles=5 -r --quiet | r.recode elevation \
 <a href="v.rast.stats.html">v.rast.stats</a>
 </em>
 
-
 <h2>AUTHOR</h2>
 
-Glynn Clements
+Glynn Clements<br>
+Markus Metz
 <!--
 <p>
 <i>Last changed: $Date$</i>

+ 68 - 20
raster/r.stats.quantile/main.c

@@ -27,13 +27,14 @@ struct bin
 
 struct basecat
 {
-    unsigned int *slots;
-    unsigned long total;
-    int num_values;
+    size_t *slots;
+    size_t total;
+    size_t num_values;
     DCELL min, max, slot_size;
     int num_slots;
     unsigned char *slot_bins;
-    int num_bins;
+    int num_bins_alloc;
+    int num_bins_used;
     struct bin *bins;
     DCELL *values;
     DCELL *quants;
@@ -66,12 +67,38 @@ static inline int get_slot(struct basecat *bc, DCELL c)
     return i;
 }
 
+/* get zero-based rank for quantile */
+/* generic formula for one-based rank
+ * rank = quant * (N + 1 - 2C) + C
+ * with quant = quantile, N = number of values, C = constant
+ * common values for C:
+ * C = 0
+ *   rank = quant * (N + 1)
+ *   recommended by NIST (National Institute of Standards and Technology)
+ *   https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
+ * C = 0.5
+ *   rank = quant * N + 0.5
+ *   Matlab
+ * C = 1
+ *   rank = quant * (N - 1) + 1
+ *   numpy, R, MS Excel, ...
+ *   Noted as an alternative by NIST */
 static inline double get_quantile(struct basecat *bc, int n)
 {
-    if (n >= num_quants)
+    double rnk;
+
+    if (n >= num_quants) {
+	/* stop condition for initialize_bins() */
 	return (double)bc->total + bc->total;
+    }
 
-    return (double)bc->total * quants[n];
+    rnk = quants[n] * (bc->total - 1);
+    if (rnk < 0)
+	rnk = 0;
+    if (rnk > bc->total - 1)
+	rnk = bc->total - 1;
+
+    return rnk;
 }
 
 static void get_slot_counts(int basefile, int coverfile)
@@ -123,6 +150,8 @@ static void get_slot_counts(int basefile, int coverfile)
 	G_fatal_error(_("No cells found where both base and cover are not NULL"));
 
     for (i = 0; i < num_cats; i++) {
+	int num_slots_max;
+
 	bc = &basecats[i];
 
 	bc->num_slots = 0;
@@ -131,9 +160,12 @@ static void get_slot_counts(int basefile, int coverfile)
 	if (bc->max <= bc->min)
 	    continue;
 
-	bc->num_slots = 1;
-	if (bc->total * 10 > (unsigned long) num_slots)
-	    bc->num_slots = num_slots;
+	num_slots_max = bc->total / 1000;
+	if (num_slots_max < 1)
+	    num_slots_max = 1;
+	if (bc->num_slots > num_slots_max) {
+	    bc->num_slots = num_slots_max;
+	}
 
 	bc->slots = G_calloc(bc->num_slots, sizeof(unsigned int));
 	bc->slot_size = (bc->max - bc->min) / bc->num_slots;
@@ -179,23 +211,32 @@ static void initialize_bins(void)
 	double next;
 	int num_values = 0;
 	int bin = 0;
-	unsigned long accum = 0;
+	size_t accum = 0;
 	int quant = 0;
+	int use_next_slot = 0;
 
 	if (bc->num_slots == 0)
 	    continue;
 
-	bc->bins = G_calloc(num_quants, sizeof(struct bin));
+	bc->num_bins_alloc = num_quants * 2;
+	bc->bins = G_calloc(bc->num_bins_alloc, sizeof(struct bin));
 	bc->slot_bins = G_calloc(bc->num_slots, sizeof(unsigned char));
 
 	next = get_quantile(bc, quant);
 
+	/* for a given quantile, two bins might be needed
+	 * if the index for this quantile is
+	 * > accumulated count of current bin
+	 * and
+	 * < accumulated count of next bin */
+
 	for (slot = 0; slot < bc->num_slots; slot++) {
-	    unsigned int count = bc->slots[slot];
-	    unsigned long accum2 = accum + count;
+	    size_t count = bc->slots[slot];
+	    size_t accum2 = accum + count;
 
-	    if (accum2 > next ||
-	        (slot == bc->num_slots - 1 && accum2 == next)) {
+	    if (count > 0 &&
+	        (accum2 > next || use_next_slot) &&
+	        bin < bc->num_bins_alloc) {
 		struct bin *b = &bc->bins[bin];
 
 		bc->slot_bins[slot] = ++bin;
@@ -204,8 +245,15 @@ static void initialize_bins(void)
 		b->base = num_values;
 		b->count = 0;
 
-		while (accum2 > next)
-		    next = get_quantile(bc, ++quant);
+		use_next_slot = 0;
+
+		if (accum2 - next < 1) {
+		    use_next_slot = 1;
+		}
+		else {
+		    while (accum2 > next)
+			next = get_quantile(bc, ++quant);
+		}
 
 		num_values += count;
 	    }
@@ -214,7 +262,7 @@ static void initialize_bins(void)
 	}
 
 	bc->num_values = num_values;
-	bc->num_bins = bin;
+	bc->num_bins_used = bin;
 
 	G_free(bc->slots);
 
@@ -295,7 +343,7 @@ static void sort_bins(void)
 
 	G_free(bc->slot_bins);
 
-	for (bin = 0; bin < bc->num_bins; bin++) {
+	for (bin = 0; bin < bc->num_bins_used; bin++) {
 	    struct bin *b = &bc->bins[bin];
 
 	    qsort(&bc->values[b->base], b->count, sizeof(DCELL), compare_dcell);
@@ -649,7 +697,7 @@ int main(int argc, char *argv[])
     if (print) {
 	/* get field separator */
 	fs = G_option_to_separator(opt.fs);
-	
+
 	print_quantiles(fs, opt.file->answer, flag.t->answer);
     }
     else if (reclass)

+ 16 - 1
raster/r.stats.quantile/r.stats.quantile.html

@@ -13,6 +13,10 @@ for floating-point cover maps. It provides quantile calculations,
 which are absent from
 <em><a href="r.stats.zonal.html">r.stats.zonal</a></em>.
 
+<p>
+Quantiles are calculated following algorithm 7 from Hyndman and Fan (1996),
+which is also the default in R and numpy.
+
 <h2>EXAMPLE</h2>
 
 In this example, the raster polygon map <tt>zipcodes</tt> in the North 
@@ -37,6 +41,18 @@ r.stats.quantile base=zipcodes cover=elevation percentiles=25,50,75 \
   output=zipcodes_elev_q25,zipcodes_elev_q50,zipcodes_elev_q75
 </pre></div>
 
+<h2>REFERENCES</h2>
+
+<ul>
+<li>Hyndman and Fan (1996) <i>Sample Quantiles in Statistical 
+Packages</i>, <b>American Statistician</b>. American Statistical 
+Association. 50 (4): 361-365. DOI: <a 
+href="https://doi.org/10.2307/2684934>10.2307/2684934">10.2307/2684934</a></li>
+<li><a 
+href="https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm"><i>Engineering 
+Statistics Handbook: Percentile</i></a>, NIST</li>
+</ul>
+
 <h2>SEE ALSO</h2>
 
 <em>
@@ -45,7 +61,6 @@ r.stats.quantile base=zipcodes cover=elevation percentiles=25,50,75 \
 <a href="r.statistics.html">r.statistics</a>
 </em>
 
-
 <h2>AUTHOR</h2>
 
 Glynn Clements<br>