db.univar.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. #!/usr/bin/env python
  2. ############################################################################
  3. #
  4. # MODULE: db.univar (formerly called v.univar.sh)
  5. # AUTHOR(S): Michael Barton, Arizona State University
  6. # Converted to Python by Glynn Clements
  7. # Sync'ed to r.univar by Markus Metz
  8. # PURPOSE: Calculates univariate statistics from a GRASS vector map attribute column.
  9. # Based on r.univar.sh by Markus Neteler
  10. # COPYRIGHT: (C) 2005, 2007, 2008 by the GRASS Development Team
  11. #
  12. # This program is free software under the GNU General Public
  13. # License (>=v2). Read the file COPYING that comes with GRASS
  14. # for details.
  15. #
  16. #############################################################################
  17. #%module
  18. #% description: Calculates univariate statistics on selected table column.
  19. #% keyword: database
  20. #% keyword: statistics
  21. #% keyword: attribute table
  22. #%end
  23. #%option G_OPT_DB_TABLE
  24. #% key: table
  25. #% required: yes
  26. #%end
  27. #%option G_OPT_DB_COLUMN
  28. #% description: Name of attribute column on which to calculate statistics (must be numeric)
  29. #% required: yes
  30. #%end
  31. #%option G_OPT_DB_DATABASE
  32. #%end
  33. #%option G_OPT_DB_DRIVER
  34. #% options: dbf,odbc,ogr,sqlite,pg
  35. #%end
  36. #%option G_OPT_DB_WHERE
  37. #%end
  38. #%option
  39. #% key: percentile
  40. #% type: double
  41. #% description: Percentile to calculate (requires extended statistics flag)
  42. #% required : no
  43. #% answer: 90
  44. #% options: 0-100
  45. #% multiple: yes
  46. #%end
  47. #%flag
  48. #% key: e
  49. #% description: Extended statistics (quartiles and 90th percentile)
  50. #%end
  51. #%flag
  52. #% key: g
  53. #% description: Print stats in shell script style
  54. #%end
  55. import sys
  56. import atexit
  57. import math
  58. import grass.script as gscript
  59. # i18N
  60. import os
  61. import gettext
  62. gettext.install('grassmods', os.path.join(os.getenv("GISBASE"), 'locale'))
  63. def cleanup():
  64. for ext in ['', '.sort']:
  65. gscript.try_remove(tmp + ext)
  66. def sortfile(infile, outfile):
  67. inf = open(infile, 'r')
  68. outf = open(outfile, 'w')
  69. if gscript.find_program('sort', '--help'):
  70. gscript.run_command('sort', flags='n', stdin=inf, stdout=outf)
  71. else:
  72. # FIXME: we need a large-file sorting function
  73. gscript.warning(_("'sort' not found: sorting in memory"))
  74. lines = inf.readlines()
  75. for i in range(len(lines)):
  76. lines[i] = float(lines[i].rstrip('\r\n'))
  77. lines.sort()
  78. for line in lines:
  79. outf.write(str(line) + '\n')
  80. inf.close()
  81. outf.close()
  82. def main():
  83. global tmp
  84. tmp = gscript.tempfile()
  85. extend = flags['e']
  86. shellstyle = flags['g']
  87. table = options['table']
  88. column = options['column']
  89. database = options['database']
  90. driver = options['driver']
  91. where = options['where']
  92. perc = options['percentile']
  93. perc = [float(p) for p in perc.split(',')]
  94. desc_table = gscript.db_describe(table, database=database, driver=driver)
  95. if not desc_table:
  96. gscript.fatal(_("Unable to describe table <%s>") % table)
  97. found = False
  98. for cname, ctype, cwidth in desc_table['cols']:
  99. if cname == column:
  100. found = True
  101. if ctype not in ('INTEGER', 'DOUBLE PRECISION'):
  102. gscript.fatal(_("Column <%s> is not numeric") % cname)
  103. if not found:
  104. gscript.fatal(_("Column <%s> not found in table <%s>") % (column, table))
  105. if not shellstyle:
  106. gscript.verbose(_("Calculation for column <%s> of table <%s>..."
  107. ) % (column, table))
  108. gscript.message(_("Reading column values..."))
  109. sql = "SELECT %s FROM %s" % (column, table)
  110. if where:
  111. sql += " WHERE " + where
  112. if not database:
  113. database = None
  114. if not driver:
  115. driver = None
  116. tmpf = open(tmp, 'w')
  117. gscript.run_command('db.select', flags='c', table=table,
  118. database=database, driver=driver, sql=sql,
  119. stdout=tmpf)
  120. tmpf.close()
  121. # check if result is empty
  122. tmpf = open(tmp)
  123. if tmpf.read(1) == '':
  124. gscript.fatal(_("Table <%s> contains no data.") % table)
  125. tmpf.close()
  126. # calculate statistics
  127. if not shellstyle:
  128. gscript.verbose(_("Calculating statistics..."))
  129. N = 0
  130. sum = 0.0
  131. sum2 = 0.0
  132. sum3 = 0.0
  133. minv = 1e300
  134. maxv = -1e300
  135. tmpf = open(tmp)
  136. for line in tmpf:
  137. if len(line.rstrip('\r\n')) == 0:
  138. continue
  139. x = float(line.rstrip('\r\n'))
  140. N += 1
  141. sum += x
  142. sum2 += x * x
  143. sum3 += abs(x)
  144. maxv = max(maxv, x)
  145. minv = min(minv, x)
  146. tmpf.close()
  147. if N <= 0:
  148. gscript.fatal(_("No non-null values found"))
  149. if not shellstyle:
  150. sys.stdout.write("Number of values: %d\n" % N)
  151. sys.stdout.write("Minimum: %.15g\n" % minv)
  152. sys.stdout.write("Maximum: %.15g\n" % maxv)
  153. sys.stdout.write("Range: %.15g\n" % (maxv - minv))
  154. sys.stdout.write("Mean: %.15g\n" % (sum / N))
  155. sys.stdout.write(
  156. "Arithmetic mean of absolute values: %.15g\n" %
  157. (sum3 / N))
  158. sys.stdout.write("Variance: %.15g\n" % ((sum2 - sum * sum / N) / N))
  159. sys.stdout.write(
  160. "Standard deviation: %.15g\n" %
  161. (math.sqrt((sum2 - sum * sum / N) / N)))
  162. sys.stdout.write(
  163. "Coefficient of variation: %.15g\n" %
  164. ((math.sqrt((sum2 - sum * sum / N) / N)) /
  165. (math.sqrt(sum * sum) / N)))
  166. sys.stdout.write("Sum: %.15g\n" % sum)
  167. else:
  168. sys.stdout.write("n=%d\n" % N)
  169. sys.stdout.write("min=%.15g\n" % minv)
  170. sys.stdout.write("max=%.15g\n" % maxv)
  171. sys.stdout.write("range=%.15g\n" % (maxv - minv))
  172. sys.stdout.write("mean=%.15g\n" % (sum / N))
  173. sys.stdout.write("mean_abs=%.15g\n" % (sum3 / N))
  174. sys.stdout.write("variance=%.15g\n" % ((sum2 - sum * sum / N) / N))
  175. sys.stdout.write(
  176. "stddev=%.15g\n" %
  177. (math.sqrt(
  178. (sum2 - sum * sum / N) / N)))
  179. sys.stdout.write(
  180. "coeff_var=%.15g\n" %
  181. ((math.sqrt((sum2 - sum * sum / N) / N)) /
  182. (math.sqrt(sum * sum) / N)))
  183. sys.stdout.write("sum=%.15g\n" % sum)
  184. if not extend:
  185. return
  186. # preparations:
  187. sortfile(tmp, tmp + ".sort")
  188. odd = N % 2
  189. eostr = ['even', 'odd'][odd]
  190. q25pos = round(N * 0.25)
  191. if q25pos == 0:
  192. q25pos = 1
  193. q50apos = round(N * 0.50)
  194. if q50apos == 0:
  195. q50apos = 1
  196. q50bpos = q50apos + (1 - odd)
  197. q75pos = round(N * 0.75)
  198. if q75pos == 0:
  199. q75pos = 1
  200. ppos = {}
  201. pval = {}
  202. for i in range(len(perc)):
  203. ppos[i] = round(N * perc[i] / 100)
  204. if ppos[i] == 0:
  205. ppos[i] = 1
  206. pval[i] = 0
  207. inf = open(tmp + ".sort")
  208. l = 1
  209. for line in inf:
  210. if l == q25pos:
  211. q25 = float(line.rstrip('\r\n'))
  212. if l == q50apos:
  213. q50a = float(line.rstrip('\r\n'))
  214. if l == q50bpos:
  215. q50b = float(line.rstrip('\r\n'))
  216. if l == q75pos:
  217. q75 = float(line.rstrip('\r\n'))
  218. for i in range(len(ppos)):
  219. if l == ppos[i]:
  220. pval[i] = float(line.rstrip('\r\n'))
  221. l += 1
  222. q50 = (q50a + q50b) / 2
  223. if not shellstyle:
  224. sys.stdout.write("1st Quartile: %.15g\n" % q25)
  225. sys.stdout.write("Median (%s N): %.15g\n" % (eostr, q50))
  226. sys.stdout.write("3rd Quartile: %.15g\n" % q75)
  227. for i in range(len(perc)):
  228. if perc[i] == int(perc[i]): # integer
  229. if int(perc[i]) % 10 == 1 and int(perc[i]) != 11:
  230. sys.stdout.write(
  231. "%dst Percentile: %.15g\n" %
  232. (int(
  233. perc[i]),
  234. pval[i]))
  235. elif int(perc[i]) % 10 == 2 and int(perc[i]) != 12:
  236. sys.stdout.write(
  237. "%dnd Percentile: %.15g\n" %
  238. (int(
  239. perc[i]),
  240. pval[i]))
  241. elif int(perc[i]) % 10 == 3 and int(perc[i]) != 13:
  242. sys.stdout.write(
  243. "%drd Percentile: %.15g\n" %
  244. (int(
  245. perc[i]),
  246. pval[i]))
  247. else:
  248. sys.stdout.write(
  249. "%dth Percentile: %.15g\n" %
  250. (int(
  251. perc[i]),
  252. pval[i]))
  253. else:
  254. sys.stdout.write(
  255. "%.15g Percentile: %.15g\n" %
  256. (perc[i], pval[i]))
  257. else:
  258. sys.stdout.write("first_quartile=%.15g\n" % q25)
  259. sys.stdout.write("median=%.15g\n" % q50)
  260. sys.stdout.write("third_quartile=%.15g\n" % q75)
  261. for i in range(len(perc)):
  262. percstr = "%.15g" % perc[i]
  263. percstr = percstr.replace('.', '_')
  264. sys.stdout.write("percentile_%s=%.15g\n" % (percstr, pval[i]))
  265. if __name__ == "__main__":
  266. options, flags = gscript.parser()
  267. atexit.register(cleanup)
  268. main()