123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
- #!/usr/bin/env python3
- ############################################################################
- #
- # MODULE: db.univar (formerly called v.univar.sh)
- # AUTHOR(S): Michael Barton, Arizona State University
- # Converted to Python by Glynn Clements
- # Sync'ed to r.univar by Markus Metz
- # PURPOSE: Calculates univariate statistics from a GRASS vector map attribute column.
- # Based on r.univar.sh by Markus Neteler
- # COPYRIGHT: (C) 2005, 2007, 2008 by the GRASS Development Team
- #
- # This program is free software under the GNU General Public
- # License (>=v2). Read the file COPYING that comes with GRASS
- # for details.
- #
- #############################################################################
- # %module
- # % description: Calculates univariate statistics on selected table column.
- # % keyword: database
- # % keyword: statistics
- # % keyword: attribute table
- # %end
- # %option G_OPT_DB_TABLE
- # % key: table
- # % required: yes
- # %end
- # %option G_OPT_DB_COLUMN
- # % description: Name of attribute column on which to calculate statistics (must be numeric)
- # % required: yes
- # %end
- # %option G_OPT_DB_DATABASE
- # %end
- # %option G_OPT_DB_DRIVER
- # % options: dbf,odbc,ogr,sqlite,pg
- # %end
- # %option G_OPT_DB_WHERE
- # %end
- # %option
- # % key: percentile
- # % type: double
- # % description: Percentile to calculate (requires extended statistics flag)
- # % required : no
- # % answer: 90
- # % options: 0-100
- # % multiple: yes
- # %end
- # %flag
- # % key: e
- # % description: Extended statistics (quartiles and 90th percentile)
- # %end
- # %flag
- # % key: g
- # % description: Print stats in shell script style
- # %end
- import sys
- import atexit
- import math
- import grass.script as gscript
- def cleanup():
- for ext in ["", ".sort"]:
- gscript.try_remove(tmp + ext)
- def sortfile(infile, outfile):
- inf = open(infile, "r")
- outf = open(outfile, "w")
- if gscript.find_program("sort", "--help"):
- gscript.run_command("sort", flags="n", stdin=inf, stdout=outf)
- else:
- # FIXME: we need a large-file sorting function
- gscript.warning(_("'sort' not found: sorting in memory"))
- lines = inf.readlines()
- for i in range(len(lines)):
- lines[i] = float(lines[i].rstrip("\r\n"))
- lines.sort()
- for line in lines:
- outf.write(str(line) + "\n")
- inf.close()
- outf.close()
- def main():
- global tmp
- tmp = gscript.tempfile()
- extend = flags["e"]
- shellstyle = flags["g"]
- table = options["table"]
- column = options["column"]
- database = options["database"]
- driver = options["driver"]
- where = options["where"]
- perc = options["percentile"]
- perc = [float(p) for p in perc.split(",")]
- desc_table = gscript.db_describe(table, database=database, driver=driver)
- if not desc_table:
- gscript.fatal(_("Unable to describe table <%s>") % table)
- found = False
- for cname, ctype, cwidth in desc_table["cols"]:
- if cname == column:
- found = True
- if ctype not in ("INTEGER", "DOUBLE PRECISION"):
- gscript.fatal(_("Column <%s> is not numeric") % cname)
- if not found:
- gscript.fatal(_("Column <%s> not found in table <%s>") % (column, table))
- if not shellstyle:
- gscript.verbose(
- _("Calculation for column <%s> of table <%s>...") % (column, table)
- )
- gscript.message(_("Reading column values..."))
- sql = "SELECT %s FROM %s" % (column, table)
- if where:
- sql += " WHERE " + where
- if not database:
- database = None
- if not driver:
- driver = None
- tmpf = open(tmp, "w")
- gscript.run_command(
- "db.select",
- flags="c",
- table=table,
- database=database,
- driver=driver,
- sql=sql,
- stdout=tmpf,
- )
- tmpf.close()
- # check if result is empty
- tmpf = open(tmp)
- if tmpf.read(1) == "":
- gscript.fatal(_("Table <%s> contains no data.") % table)
- tmpf.close()
- # calculate statistics
- if not shellstyle:
- gscript.verbose(_("Calculating statistics..."))
- N = 0
- sum = 0.0
- sum2 = 0.0
- sum3 = 0.0
- minv = 1e300
- maxv = -1e300
- tmpf = open(tmp)
- for line in tmpf:
- line = line.rstrip("\r\n")
- if len(line) == 0:
- continue
- x = float(line)
- N += 1
- sum += x
- sum2 += x * x
- sum3 += abs(x)
- maxv = max(maxv, x)
- minv = min(minv, x)
- tmpf.close()
- if N <= 0:
- gscript.fatal(_("No non-null values found"))
- if not shellstyle:
- sys.stdout.write("Number of values: %d\n" % N)
- sys.stdout.write("Minimum: %.15g\n" % minv)
- sys.stdout.write("Maximum: %.15g\n" % maxv)
- sys.stdout.write("Range: %.15g\n" % (maxv - minv))
- sys.stdout.write("Mean: %.15g\n" % (sum / N))
- sys.stdout.write("Arithmetic mean of absolute values: %.15g\n" % (sum3 / N))
- if not ((sum2 - sum * sum / N) / N) < 0:
- sys.stdout.write("Variance: %.15g\n" % ((sum2 - sum * sum / N) / N))
- sys.stdout.write(
- "Standard deviation: %.15g\n" % (math.sqrt((sum2 - sum * sum / N) / N))
- )
- sys.stdout.write(
- "Coefficient of variation: %.15g\n"
- % ((math.sqrt((sum2 - sum * sum / N) / N)) / (math.sqrt(sum * sum) / N))
- )
- else:
- sys.stdout.write("Variance: 0\n")
- sys.stdout.write("Standard deviation: 0\n")
- sys.stdout.write("Coefficient of variation: 0\n")
- sys.stdout.write("Sum: %.15g\n" % sum)
- else:
- sys.stdout.write("n=%d\n" % N)
- sys.stdout.write("min=%.15g\n" % minv)
- sys.stdout.write("max=%.15g\n" % maxv)
- sys.stdout.write("range=%.15g\n" % (maxv - minv))
- sys.stdout.write("mean=%.15g\n" % (sum / N))
- sys.stdout.write("mean_abs=%.15g\n" % (sum3 / N))
- if not ((sum2 - sum * sum / N) / N) < 0:
- sys.stdout.write("variance=%.15g\n" % ((sum2 - sum * sum / N) / N))
- sys.stdout.write("stddev=%.15g\n" % (math.sqrt((sum2 - sum * sum / N) / N)))
- sys.stdout.write(
- "coeff_var=%.15g\n"
- % ((math.sqrt((sum2 - sum * sum / N) / N)) / (math.sqrt(sum * sum) / N))
- )
- else:
- sys.stdout.write("variance=0\n")
- sys.stdout.write("stddev=0\n")
- sys.stdout.write("coeff_var=0\n")
- sys.stdout.write("sum=%.15g\n" % sum)
- if not extend:
- return
- # preparations:
- sortfile(tmp, tmp + ".sort")
- odd = N % 2
- eostr = ["even", "odd"][odd]
- q25pos = round(N * 0.25)
- if q25pos == 0:
- q25pos = 1
- q50apos = round(N * 0.50)
- if q50apos == 0:
- q50apos = 1
- q50bpos = q50apos + (1 - odd)
- q75pos = round(N * 0.75)
- if q75pos == 0:
- q75pos = 1
- ppos = {}
- pval = {}
- for i in range(len(perc)):
- ppos[i] = round(N * perc[i] / 100)
- if ppos[i] == 0:
- ppos[i] = 1
- pval[i] = 0
- inf = open(tmp + ".sort")
- l = 1
- for line in inf:
- line = line.rstrip("\r\n")
- if len(line) == 0:
- continue
- if l == q25pos:
- q25 = float(line)
- if l == q50apos:
- q50a = float(line)
- if l == q50bpos:
- q50b = float(line)
- if l == q75pos:
- q75 = float(line)
- for i in range(len(ppos)):
- if l == ppos[i]:
- pval[i] = float(line)
- l += 1
- q50 = (q50a + q50b) / 2
- if not shellstyle:
- sys.stdout.write("1st Quartile: %.15g\n" % q25)
- sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50))
- sys.stdout.write("3rd Quartile: %.15g\n" % q75)
- for i in range(len(perc)):
- if perc[i] == int(perc[i]): # integer
- if int(perc[i]) % 10 == 1 and int(perc[i]) != 11:
- sys.stdout.write(
- "%dst Percentile: %.15g\n" % (int(perc[i]), pval[i])
- )
- elif int(perc[i]) % 10 == 2 and int(perc[i]) != 12:
- sys.stdout.write(
- "%dnd Percentile: %.15g\n" % (int(perc[i]), pval[i])
- )
- elif int(perc[i]) % 10 == 3 and int(perc[i]) != 13:
- sys.stdout.write(
- "%drd Percentile: %.15g\n" % (int(perc[i]), pval[i])
- )
- else:
- sys.stdout.write(
- "%dth Percentile: %.15g\n" % (int(perc[i]), pval[i])
- )
- else:
- sys.stdout.write("%.15g Percentile: %.15g\n" % (perc[i], pval[i]))
- else:
- sys.stdout.write("first_quartile=%.15g\n" % q25)
- sys.stdout.write("median=%.15g\n" % q50)
- sys.stdout.write("third_quartile=%.15g\n" % q75)
- for i in range(len(perc)):
- percstr = "%.15g" % perc[i]
- percstr = percstr.replace(".", "_")
- sys.stdout.write("percentile_%s=%.15g\n" % (percstr, pval[i]))
- if __name__ == "__main__":
- options, flags = gscript.parser()
- atexit.register(cleanup)
- main()
|