db.univar.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. #!/usr/bin/env python3
  2. ############################################################################
  3. #
  4. # MODULE: db.univar (formerly called v.univar.sh)
  5. # AUTHOR(S): Michael Barton, Arizona State University
  6. # Converted to Python by Glynn Clements
  7. # Sync'ed to r.univar by Markus Metz
  8. # PURPOSE: Calculates univariate statistics from a GRASS vector map attribute column.
  9. # Based on r.univar.sh by Markus Neteler
  10. # COPYRIGHT: (C) 2005, 2007, 2008 by the GRASS Development Team
  11. #
  12. # This program is free software under the GNU General Public
  13. # License (>=v2). Read the file COPYING that comes with GRASS
  14. # for details.
  15. #
  16. #############################################################################
  17. # %module
  18. # % description: Calculates univariate statistics on selected table column.
  19. # % keyword: database
  20. # % keyword: statistics
  21. # % keyword: attribute table
  22. # %end
  23. # %option G_OPT_DB_TABLE
  24. # % key: table
  25. # % required: yes
  26. # %end
  27. # %option G_OPT_DB_COLUMN
  28. # % description: Name of attribute column on which to calculate statistics (must be numeric)
  29. # % required: yes
  30. # %end
  31. # %option G_OPT_DB_DATABASE
  32. # %end
  33. # %option G_OPT_DB_DRIVER
  34. # % options: dbf,odbc,ogr,sqlite,pg
  35. # %end
  36. # %option G_OPT_DB_WHERE
  37. # %end
  38. # %option
  39. # % key: percentile
  40. # % type: double
  41. # % description: Percentile to calculate (requires extended statistics flag)
  42. # % required : no
  43. # % answer: 90
  44. # % options: 0-100
  45. # % multiple: yes
  46. # %end
  47. # %flag
  48. # % key: e
  49. # % description: Extended statistics (quartiles and 90th percentile)
  50. # %end
  51. # %flag
  52. # % key: g
  53. # % description: Print stats in shell script style
  54. # %end
  55. import sys
  56. import atexit
  57. import math
  58. import grass.script as gscript
  59. def cleanup():
  60. for ext in ["", ".sort"]:
  61. gscript.try_remove(tmp + ext)
  62. def sortfile(infile, outfile):
  63. inf = open(infile, "r")
  64. outf = open(outfile, "w")
  65. if gscript.find_program("sort", "--help"):
  66. gscript.run_command("sort", flags="n", stdin=inf, stdout=outf)
  67. else:
  68. # FIXME: we need a large-file sorting function
  69. gscript.warning(_("'sort' not found: sorting in memory"))
  70. lines = inf.readlines()
  71. for i in range(len(lines)):
  72. lines[i] = float(lines[i].rstrip("\r\n"))
  73. lines.sort()
  74. for line in lines:
  75. outf.write(str(line) + "\n")
  76. inf.close()
  77. outf.close()
  78. def main():
  79. global tmp
  80. tmp = gscript.tempfile()
  81. extend = flags["e"]
  82. shellstyle = flags["g"]
  83. table = options["table"]
  84. column = options["column"]
  85. database = options["database"]
  86. driver = options["driver"]
  87. where = options["where"]
  88. perc = options["percentile"]
  89. perc = [float(p) for p in perc.split(",")]
  90. desc_table = gscript.db_describe(table, database=database, driver=driver)
  91. if not desc_table:
  92. gscript.fatal(_("Unable to describe table <%s>") % table)
  93. found = False
  94. for cname, ctype, cwidth in desc_table["cols"]:
  95. if cname == column:
  96. found = True
  97. if ctype not in ("INTEGER", "DOUBLE PRECISION"):
  98. gscript.fatal(_("Column <%s> is not numeric") % cname)
  99. if not found:
  100. gscript.fatal(_("Column <%s> not found in table <%s>") % (column, table))
  101. if not shellstyle:
  102. gscript.verbose(
  103. _("Calculation for column <%s> of table <%s>...") % (column, table)
  104. )
  105. gscript.message(_("Reading column values..."))
  106. sql = "SELECT %s FROM %s" % (column, table)
  107. if where:
  108. sql += " WHERE " + where
  109. if not database:
  110. database = None
  111. if not driver:
  112. driver = None
  113. tmpf = open(tmp, "w")
  114. gscript.run_command(
  115. "db.select",
  116. flags="c",
  117. table=table,
  118. database=database,
  119. driver=driver,
  120. sql=sql,
  121. stdout=tmpf,
  122. )
  123. tmpf.close()
  124. # check if result is empty
  125. tmpf = open(tmp)
  126. if tmpf.read(1) == "":
  127. gscript.fatal(_("Table <%s> contains no data.") % table)
  128. tmpf.close()
  129. # calculate statistics
  130. if not shellstyle:
  131. gscript.verbose(_("Calculating statistics..."))
  132. N = 0
  133. sum = 0.0
  134. sum2 = 0.0
  135. sum3 = 0.0
  136. minv = 1e300
  137. maxv = -1e300
  138. tmpf = open(tmp)
  139. for line in tmpf:
  140. line = line.rstrip("\r\n")
  141. if len(line) == 0:
  142. continue
  143. x = float(line)
  144. N += 1
  145. sum += x
  146. sum2 += x * x
  147. sum3 += abs(x)
  148. maxv = max(maxv, x)
  149. minv = min(minv, x)
  150. tmpf.close()
  151. if N <= 0:
  152. gscript.fatal(_("No non-null values found"))
  153. if not shellstyle:
  154. sys.stdout.write("Number of values: %d\n" % N)
  155. sys.stdout.write("Minimum: %.15g\n" % minv)
  156. sys.stdout.write("Maximum: %.15g\n" % maxv)
  157. sys.stdout.write("Range: %.15g\n" % (maxv - minv))
  158. sys.stdout.write("Mean: %.15g\n" % (sum / N))
  159. sys.stdout.write("Arithmetic mean of absolute values: %.15g\n" % (sum3 / N))
  160. if not ((sum2 - sum * sum / N) / N) < 0:
  161. sys.stdout.write("Variance: %.15g\n" % ((sum2 - sum * sum / N) / N))
  162. sys.stdout.write(
  163. "Standard deviation: %.15g\n" % (math.sqrt((sum2 - sum * sum / N) / N))
  164. )
  165. sys.stdout.write(
  166. "Coefficient of variation: %.15g\n"
  167. % ((math.sqrt((sum2 - sum * sum / N) / N)) / (math.sqrt(sum * sum) / N))
  168. )
  169. else:
  170. sys.stdout.write("Variance: 0\n")
  171. sys.stdout.write("Standard deviation: 0\n")
  172. sys.stdout.write("Coefficient of variation: 0\n")
  173. sys.stdout.write("Sum: %.15g\n" % sum)
  174. else:
  175. sys.stdout.write("n=%d\n" % N)
  176. sys.stdout.write("min=%.15g\n" % minv)
  177. sys.stdout.write("max=%.15g\n" % maxv)
  178. sys.stdout.write("range=%.15g\n" % (maxv - minv))
  179. sys.stdout.write("mean=%.15g\n" % (sum / N))
  180. sys.stdout.write("mean_abs=%.15g\n" % (sum3 / N))
  181. if not ((sum2 - sum * sum / N) / N) < 0:
  182. sys.stdout.write("variance=%.15g\n" % ((sum2 - sum * sum / N) / N))
  183. sys.stdout.write("stddev=%.15g\n" % (math.sqrt((sum2 - sum * sum / N) / N)))
  184. sys.stdout.write(
  185. "coeff_var=%.15g\n"
  186. % ((math.sqrt((sum2 - sum * sum / N) / N)) / (math.sqrt(sum * sum) / N))
  187. )
  188. else:
  189. sys.stdout.write("variance=0\n")
  190. sys.stdout.write("stddev=0\n")
  191. sys.stdout.write("coeff_var=0\n")
  192. sys.stdout.write("sum=%.15g\n" % sum)
  193. if not extend:
  194. return
  195. # preparations:
  196. sortfile(tmp, tmp + ".sort")
  197. odd = N % 2
  198. eostr = ["even", "odd"][odd]
  199. q25pos = round(N * 0.25)
  200. if q25pos == 0:
  201. q25pos = 1
  202. q50apos = round(N * 0.50)
  203. if q50apos == 0:
  204. q50apos = 1
  205. q50bpos = q50apos + (1 - odd)
  206. q75pos = round(N * 0.75)
  207. if q75pos == 0:
  208. q75pos = 1
  209. ppos = {}
  210. pval = {}
  211. for i in range(len(perc)):
  212. ppos[i] = round(N * perc[i] / 100)
  213. if ppos[i] == 0:
  214. ppos[i] = 1
  215. pval[i] = 0
  216. inf = open(tmp + ".sort")
  217. l = 1
  218. for line in inf:
  219. line = line.rstrip("\r\n")
  220. if len(line) == 0:
  221. continue
  222. if l == q25pos:
  223. q25 = float(line)
  224. if l == q50apos:
  225. q50a = float(line)
  226. if l == q50bpos:
  227. q50b = float(line)
  228. if l == q75pos:
  229. q75 = float(line)
  230. for i in range(len(ppos)):
  231. if l == ppos[i]:
  232. pval[i] = float(line)
  233. l += 1
  234. q50 = (q50a + q50b) / 2
  235. if not shellstyle:
  236. sys.stdout.write("1st Quartile: %.15g\n" % q25)
  237. sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50))
  238. sys.stdout.write("3rd Quartile: %.15g\n" % q75)
  239. for i in range(len(perc)):
  240. if perc[i] == int(perc[i]): # integer
  241. if int(perc[i]) % 10 == 1 and int(perc[i]) != 11:
  242. sys.stdout.write(
  243. "%dst Percentile: %.15g\n" % (int(perc[i]), pval[i])
  244. )
  245. elif int(perc[i]) % 10 == 2 and int(perc[i]) != 12:
  246. sys.stdout.write(
  247. "%dnd Percentile: %.15g\n" % (int(perc[i]), pval[i])
  248. )
  249. elif int(perc[i]) % 10 == 3 and int(perc[i]) != 13:
  250. sys.stdout.write(
  251. "%drd Percentile: %.15g\n" % (int(perc[i]), pval[i])
  252. )
  253. else:
  254. sys.stdout.write(
  255. "%dth Percentile: %.15g\n" % (int(perc[i]), pval[i])
  256. )
  257. else:
  258. sys.stdout.write("%.15g Percentile: %.15g\n" % (perc[i], pval[i]))
  259. else:
  260. sys.stdout.write("first_quartile=%.15g\n" % q25)
  261. sys.stdout.write("median=%.15g\n" % q50)
  262. sys.stdout.write("third_quartile=%.15g\n" % q75)
  263. for i in range(len(perc)):
  264. percstr = "%.15g" % perc[i]
  265. percstr = percstr.replace(".", "_")
  266. sys.stdout.write("percentile_%s=%.15g\n" % (percstr, pval[i]))
  267. if __name__ == "__main__":
  268. options, flags = gscript.parser()
  269. atexit.register(cleanup)
  270. main()