v.db.univar.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #!/usr/bin/env python
  2. ############################################################################
  3. #
  4. # MODULE: v.db.univar (formerly called v.univar.sh)
  5. # AUTHOR(S): Michael Barton, Arizona State University
  6. # Converted to Python by Glynn Clements
  7. # Sync'ed to r.univar by Markus Metz
  8. # PURPOSE: Calculates univariate statistics from a GRASS vector map attribute column.
  9. # Based on r.univar.sh by Markus Neteler
  10. # COPYRIGHT: (C) 2005, 2007, 2008 by the GRASS Development Team
  11. #
  12. # This program is free software under the GNU General Public
  13. # License (>=v2). Read the file COPYING that comes with GRASS
  14. # for details.
  15. #
  16. #############################################################################
  17. #%Module
  18. #% description: Calculates univariate statistics on selected table column for a GRASS vector map.
  19. #% keywords: vector
  20. #% keywords: statistics
  21. #%End
  22. #%flag
  23. #% key: e
  24. #% description: Extended statistics (quartiles and 90th percentile)
  25. #%END
  26. #%flag
  27. #% key: g
  28. #% description: Print stats in shell script style
  29. #%END
  30. #%option
  31. #% key: table
  32. #% type: string
  33. #% gisprompt: old_dbtable,dbtable,dbtable
  34. #% description: Name of data table
  35. #% required : yes
  36. #%End
  37. #%option
  38. #% key: column
  39. #% type: string
  40. #% gisprompt: old_dbcolumn,dbcolumn,dbcolumn
  41. #% description: Column on which to calculate statistics (must be numeric)
  42. #% required : yes
  43. #%end
  44. #%option
  45. #% key: database
  46. #% type: string
  47. #% gisprompt: old_dbname,dbname,dbname
  48. #% answer: DEFAULT_DBNAME
  49. #% description: Database/directory for table
  50. #% required : no
  51. #%end
  52. #%option
  53. #% key: driver
  54. #% type: string
  55. #% gisprompt: old_dbdriver,dbdriver,dbdriver
  56. #% options: dbf,odbc,ogr,sqlite
  57. #% description: Database driver
  58. #% required : no
  59. #%end
  60. #%option
  61. #% key: where
  62. #% type: string
  63. #% description: WHERE conditions of SQL statement without 'where' keyword
  64. #% required : no
  65. #%end
  66. #%option
  67. #% key: percentile
  68. #% type: double
  69. #% description: Percentile to calculate (requires extended statistics flag)
  70. #% required : no
  71. #% answer: 90
  72. #% options: 0-100
  73. #% multiple: yes
  74. #%end
  75. import sys
  76. import os
  77. import atexit
  78. import math
  79. from grass.script import core as grass
  80. def cleanup():
  81. for ext in ['', '.sort']:
  82. grass.try_remove(tmp + ext)
  83. def sortfile(infile, outfile):
  84. inf = file(infile, 'r')
  85. outf = file(outfile, 'w')
  86. if grass.find_program('sort', ['-n']):
  87. grass.run_command('sort', flags = 'n', stdin = inf, stdout = outf)
  88. else:
  89. # FIXME: we need a large-file sorting function
  90. grass.warning(_("'sort' not found: sorting in memory"))
  91. lines = inf.readlines()
  92. for i in range(len(lines)):
  93. lines[i] = float(lines[i].rstrip('\r\n'))
  94. lines.sort()
  95. for line in lines:
  96. outf.write(str(line) + '\n')
  97. inf.close()
  98. outf.close()
  99. def main():
  100. global tmp
  101. tmp = grass.tempfile()
  102. extend = flags['e']
  103. shellstyle = flags['g']
  104. table = options['table']
  105. column = options['column']
  106. database = options['database']
  107. driver = options['driver']
  108. where = options['where']
  109. perc = options['percentile']
  110. perc = [float(p) for p in perc.split(',')]
  111. if not shellstyle:
  112. grass.message(_("Calculation for column <%s> of table <%s>...") % (column, table))
  113. grass.message(_("Reading column values..."))
  114. sql = "SELECT %s FROM %s" % (column, table)
  115. if where:
  116. sql += " WHERE " + where
  117. if not database:
  118. database = None
  119. if not driver:
  120. driver = None
  121. tmpf = file(tmp, 'w')
  122. grass.run_command('db.select', flags = 'c', table = table,
  123. database = database, driver = driver, sql = sql,
  124. stdout = tmpf)
  125. tmpf.close()
  126. # check if result is empty
  127. tmpf = file(tmp)
  128. if tmpf.read(1) == '':
  129. grass.fatal(_("Table <%s> contains no data.") % table)
  130. tmpf.close()
  131. # calculate statistics
  132. if not shellstyle:
  133. grass.message(_("Calculating statistics..."))
  134. N = 0
  135. sum = 0.0
  136. sum2 = 0.0
  137. sum3 = 0.0
  138. minv = 1e300
  139. maxv = -1e300
  140. tmpf = file(tmp)
  141. for line in tmpf:
  142. x = float(line.rstrip('\r\n'))
  143. N += 1
  144. sum += x
  145. sum2 += x * x
  146. sum3 += abs(x)
  147. maxv = max(maxv, x)
  148. minv = min(minv, x)
  149. tmpf.close()
  150. if N <= 0:
  151. grass.fatal(_("No non-null values found"))
  152. if not shellstyle:
  153. print ""
  154. print "Number of values: %d" % N
  155. print "Minimum: %g" % minv
  156. print "Maximum: %g" % maxv
  157. print "Range: %g" % (maxv - minv)
  158. print "-----"
  159. print "Mean: %g" % (sum/N)
  160. print "Arithmetic mean of absolute values: %g" % (sum3/N)
  161. print "Variance: %g" % ((sum2 - sum*sum/N)/N)
  162. print "Standard deviation: %g" % (math.sqrt((sum2 - sum*sum/N)/N))
  163. print "Coefficient of variation: %g" % ((math.sqrt((sum2 - sum*sum/N)/N))/(math.sqrt(sum*sum)/N))
  164. print "Sum: %g" % sum
  165. print "-----"
  166. else:
  167. print "n=%d" % N
  168. print "min=%.15g" % minv
  169. print "max=%.15g" % maxv
  170. print "range=%.15g" % (maxv - minv)
  171. print "mean=%.15g" % (sum/N)
  172. print "mean_abs=%.15g" % (sum3/N)
  173. print "variance=%.15g" % ((sum2 - sum*sum/N)/N)
  174. print "stddev=%.15g" % (math.sqrt((sum2 - sum*sum/N)/N))
  175. print "coeff_var=%.15g" % ((math.sqrt((sum2 - sum*sum/N)/N))/(math.sqrt(sum*sum)/N))
  176. print "sum=%.15g" % sum
  177. if not extend:
  178. return
  179. # preparations:
  180. sortfile(tmp, tmp + ".sort")
  181. number = N
  182. odd = N % 2
  183. eostr = ['even','odd'][odd]
  184. q25pos = round(N * 0.25)
  185. q50apos = round(N * 0.50)
  186. q50bpos = q50apos + (1 - odd)
  187. q75pos = round(N * 0.75)
  188. ppos = {}
  189. pval = {}
  190. for i in range(len(perc)):
  191. ppos[i] = round(N * perc[i] / 100)
  192. pval[i] = 0
  193. inf = file(tmp + ".sort")
  194. l = 1
  195. for line in inf:
  196. if l == q25pos:
  197. q25 = float(line.rstrip('\r\n'))
  198. if l == q50apos:
  199. q50a = float(line.rstrip('\r\n'))
  200. if l == q50bpos:
  201. q50b = float(line.rstrip('\r\n'))
  202. if l == q75pos:
  203. q75 = float(line.rstrip('\r\n'))
  204. for i in range(len(ppos)):
  205. if l == ppos[i]:
  206. pval[i] = float(line.rstrip('\r\n'))
  207. l += 1
  208. q50 = (q50a + q50b) / 2
  209. if not shellstyle:
  210. print "1st Quartile: %g" % q25
  211. print "Median (%s N): %g" % (eostr, q50)
  212. print "3rd Quartile: %g" % q75
  213. for i in range(len(perc)):
  214. if perc[i] == int(perc[i]): # integer
  215. if int(perc[i]) % 10 == 1 and int(perc[i]) != 11:
  216. print "%dst Percentile: %g" % (int(perc[i]), pval[i])
  217. elif int(perc[i]) % 10 == 2 and int(perc[i]) != 12:
  218. print "%dnd Percentile: %g" % (int(perc[i]), pval[i])
  219. elif int(perc[i]) % 10 == 3 and int(perc[i]) != 13:
  220. print "%drd Percentile: %g" % (int(perc[i]), pval[i])
  221. else:
  222. print "%dth Percentile: %g" % (int(perc[i]), pval[i])
  223. else:
  224. print "%.15g Percentile: %g" % (perc[i], pval[i])
  225. else:
  226. print "first_quartile=%g" % q25
  227. print "median=%g" % q50
  228. print "third_quartile=%g" % q75
  229. for i in range(len(perc)):
  230. percstr = "%.15g" % perc[i]
  231. percstr = percstr.replace('.','_')
  232. print "percentile_%s=%g" % (percstr, pval[i])
  233. if __name__ == "__main__":
  234. options, flags = grass.parser()
  235. atexit.register(cleanup)
  236. main()