#!/usr/bin/env python ############################################################################ # # MODULE: db.univar (formerly called v.univar.sh) # AUTHOR(S): Michael Barton, Arizona State University # Converted to Python by Glynn Clements # Sync'ed to r.univar by Markus Metz # PURPOSE: Calculates univariate statistics from a GRASS vector map attribute column. # Based on r.univar.sh by Markus Neteler # COPYRIGHT: (C) 2005, 2007, 2008 by the GRASS Development Team # # This program is free software under the GNU General Public # License (>=v2). Read the file COPYING that comes with GRASS # for details. # ############################################################################# #%module #% description: Calculates univariate statistics on selected table column. #% keyword: database #% keyword: statistics #% keyword: attribute table #%end #%option G_OPT_DB_TABLE #% key: table #% required: yes #%end #%option G_OPT_DB_COLUMN #% description: Name of attribute column on which to calculate statistics (must be numeric) #% required: yes #%end #%option G_OPT_DB_DATABASE #%end #%option G_OPT_DB_DRIVER #% options: dbf,odbc,ogr,sqlite,pg #%end #%option G_OPT_DB_WHERE #%end #%option #% key: percentile #% type: double #% description: Percentile to calculate (requires extended statistics flag) #% required : no #% answer: 90 #% options: 0-100 #% multiple: yes #%end #%flag #% key: e #% description: Extended statistics (quartiles and 90th percentile) #%end #%flag #% key: g #% description: Print stats in shell script style #%end import sys import os import atexit import math import grass.script as grass def cleanup(): for ext in ['', '.sort']: grass.try_remove(tmp + ext) def sortfile(infile, outfile): inf = file(infile, 'r') outf = file(outfile, 'w') if grass.find_program('sort', '--help'): grass.run_command('sort', flags = 'n', stdin = inf, stdout = outf) else: # FIXME: we need a large-file sorting function grass.warning(_("'sort' not found: sorting in memory")) lines = inf.readlines() for i in range(len(lines)): lines[i] = float(lines[i].rstrip('\r\n')) lines.sort() for line in lines: outf.write(str(line) + '\n') inf.close() outf.close() def main(): global tmp tmp = grass.tempfile() extend = flags['e'] shellstyle = flags['g'] table = options['table'] column = options['column'] database = options['database'] driver = options['driver'] where = options['where'] perc = options['percentile'] perc = [float(p) for p in perc.split(',')] desc_table = grass.db_describe(table, database=database, driver=driver) if not desc_table: grass.fatal(_("Unable to describe table <%s>") % table) found = False for cname, ctype, cwidth in desc_table['cols']: if cname == column: found = True if ctype not in ('INTEGER', 'DOUBLE PRECISION'): grass.fatal(_("Column <%s> is not numeric") % cname) if not found: grass.fatal(_("Column <%s> not found in table <%s>") % (column, table)) if not shellstyle: grass.verbose(_("Calculation for column <%s> of table <%s>...") % (column, table)) grass.message(_("Reading column values...")) sql = "SELECT %s FROM %s" % (column, table) if where: sql += " WHERE " + where if not database: database = None if not driver: driver = None tmpf = file(tmp, 'w') grass.run_command('db.select', flags = 'c', table = table, database = database, driver = driver, sql = sql, stdout = tmpf) tmpf.close() # check if result is empty tmpf = file(tmp) if tmpf.read(1) == '': grass.fatal(_("Table <%s> contains no data.") % table) tmpf.close() # calculate statistics if not shellstyle: grass.verbose(_("Calculating statistics...")) N = 0 sum = 0.0 sum2 = 0.0 sum3 = 0.0 minv = 1e300 maxv = -1e300 tmpf = file(tmp) for line in tmpf: if len(line.rstrip('\r\n')) == 0: continue x = float(line.rstrip('\r\n')) N += 1 sum += x sum2 += x * x sum3 += abs(x) maxv = max(maxv, x) minv = min(minv, x) tmpf.close() if N <= 0: grass.fatal(_("No non-null values found")) if not shellstyle: sys.stdout.write("Number of values: %d\n"% N) sys.stdout.write("Minimum: %.15g\n"% minv) sys.stdout.write("Maximum: %.15g\n"% maxv) sys.stdout.write("Range: %.15g\n"% (maxv - minv)) sys.stdout.write("Mean: %.15g\n"% (sum/N)) sys.stdout.write("Arithmetic mean of absolute values: %.15g\n"% (sum3/N)) sys.stdout.write("Variance: %.15g\n"% ((sum2 - sum*sum/N)/N)) sys.stdout.write("Standard deviation: %.15g\n"% (math.sqrt((sum2 - sum*sum/N)/N))) sys.stdout.write("Coefficient of variation: %.15g\n"% ((math.sqrt((sum2 - sum*sum/N)/N))/(math.sqrt(sum*sum)/N))) sys.stdout.write("Sum: %.15g\n"% sum) else: sys.stdout.write("n=%d\n"% N) sys.stdout.write("min=%.15g\n"% minv) sys.stdout.write("max=%.15g\n"% maxv) sys.stdout.write("range=%.15g\n"% (maxv - minv)) sys.stdout.write("mean=%.15g\n"% (sum/N)) sys.stdout.write("mean_abs=%.15g\n"% (sum3/N)) sys.stdout.write("variance=%.15g\n"% ((sum2 - sum*sum/N)/N)) sys.stdout.write("stddev=%.15g\n"% (math.sqrt((sum2 - sum*sum/N)/N))) sys.stdout.write("coeff_var=%.15g\n"% ((math.sqrt((sum2 - sum*sum/N)/N))/(math.sqrt(sum*sum)/N))) sys.stdout.write("sum=%.15g\n"% sum) if not extend: return # preparations: sortfile(tmp, tmp + ".sort") number = N odd = N % 2 eostr = ['even','odd'][odd] q25pos = round(N * 0.25) if q25pos == 0: q25pos = 1 q50apos = round(N * 0.50) if q50apos == 0: q50apos = 1 q50bpos = q50apos + (1 - odd) q75pos = round(N * 0.75) if q75pos == 0: q75pos = 1 ppos = {} pval = {} for i in range(len(perc)): ppos[i] = round(N * perc[i] / 100) if ppos[i] == 0: ppos[i] = 1 pval[i] = 0 inf = file(tmp + ".sort") l = 1 for line in inf: if l == q25pos: q25 = float(line.rstrip('\r\n')) if l == q50apos: q50a = float(line.rstrip('\r\n')) if l == q50bpos: q50b = float(line.rstrip('\r\n')) if l == q75pos: q75 = float(line.rstrip('\r\n')) for i in range(len(ppos)): if l == ppos[i]: pval[i] = float(line.rstrip('\r\n')) l += 1 q50 = (q50a + q50b) / 2 if not shellstyle: sys.stdout.write("1st Quartile: %.15g\n" % q25) sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50)) sys.stdout.write("3rd Quartile: %.15g\n" % q75) for i in range(len(perc)): if perc[i] == int(perc[i]): # integer if int(perc[i]) % 10 == 1 and int(perc[i]) != 11: sys.stdout.write("%dst Percentile: %.15g\n"% (int(perc[i]), pval[i])) elif int(perc[i]) % 10 == 2 and int(perc[i]) != 12: sys.stdout.write("%dnd Percentile: %.15g\n"% (int(perc[i]), pval[i])) elif int(perc[i]) % 10 == 3 and int(perc[i]) != 13: sys.stdout.write("%drd Percentile: %.15g\n"% (int(perc[i]), pval[i])) else: sys.stdout.write("%dth Percentile: %.15g\n"% (int(perc[i]), pval[i])) else: sys.stdout.write("%.15g Percentile: %.15g\n"% (perc[i], pval[i])) else: sys.stdout.write("first_quartile=%.15g\n"% q25) sys.stdout.write("median=%.15g\n"% q50) sys.stdout.write("third_quartile=%.15g\n"% q75) for i in range(len(perc)): percstr = "%.15g" % perc[i] percstr = percstr.replace('.','_') sys.stdout.write("percentile_%s=%.15g\n"% (percstr, pval[i])) if __name__ == "__main__": options, flags = grass.parser() atexit.register(cleanup) main()