class.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. /* functions to classify sorted arrays of doubles and fill a vector of classbreaks */
  2. #include <grass/glocale.h>
  3. #include <grass/arraystats.h>
  4. double class_apply_algorithm(char *algo, double *data, int nrec, int *nbreaks,
  5. double *classbreaks)
  6. {
  7. double finfo = 0.0;
  8. if (G_strcasecmp(algo, "int") == 0)
  9. finfo = class_interval(data, nrec, *nbreaks, classbreaks);
  10. else if (G_strcasecmp(algo, "std") == 0)
  11. finfo = class_stdev(data, nrec, *nbreaks, classbreaks);
  12. else if (G_strcasecmp(algo, "qua") == 0)
  13. finfo = class_quant(data, nrec, *nbreaks, classbreaks);
  14. else if (G_strcasecmp(algo, "equ") == 0)
  15. finfo = class_equiprob(data, nrec, nbreaks, classbreaks);
  16. else if (G_strcasecmp(algo, "dis") == 0)
  17. /* finfo = class_discont(data, nrec, *nbreaks, classbreaks); disabled because of bugs */
  18. G_fatal_error(_("Discont algorithm currently not available because of bugs"));
  19. else
  20. G_fatal_error(_("%s: Unknown algorithm"), algo);
  21. if (finfo == 0)
  22. G_fatal_error(_("%s: Error in classification algorithm"), algo);
  23. return finfo;
  24. }
  25. int class_interval(double *data, int count, int nbreaks, double *classbreaks)
  26. {
  27. double min, max;
  28. double step;
  29. int i = 0;
  30. min = data[0];
  31. max = data[count - 1];
  32. step = (max - min) / (nbreaks + 1);
  33. for (i = 0; i < nbreaks; i++)
  34. classbreaks[i] = min + (step * (i + 1));
  35. return (1);
  36. }
  37. double class_stdev(double *data, int count, int nbreaks, double *classbreaks)
  38. {
  39. struct GASTATS stats;
  40. int i;
  41. int nbclass;
  42. double scale = 1.0;
  43. basic_stats(data, count, &stats);
  44. nbclass = nbreaks + 1;
  45. if (nbclass % 2 == 1) { /* number of classes is uneven so we center middle class on mean */
  46. /* find appropriate fraction of stdev for step */
  47. i = 1;
  48. while (i) {
  49. if (((stats.mean + stats.stdev * scale / 2) +
  50. (stats.stdev * scale * (nbclass / 2 - 1)) > stats.max) ||
  51. ((stats.mean - stats.stdev * scale / 2) -
  52. (stats.stdev * scale * (nbclass / 2 - 1)) < stats.min))
  53. scale = scale / 2;
  54. else
  55. i = 0;
  56. }
  57. /* classbreaks below the mean */
  58. for (i = 0; i < nbreaks / 2; i++)
  59. classbreaks[i] =
  60. (stats.mean - stats.stdev * scale / 2) -
  61. stats.stdev * scale * (nbreaks / 2 - (i + 1));
  62. /* classbreaks above the mean */
  63. for (i = i; i < nbreaks; i++)
  64. classbreaks[i] =
  65. (stats.mean + stats.stdev * scale / 2) +
  66. stats.stdev * scale * (i - nbreaks / 2);
  67. }
  68. else { /* number of classes is even so mean is a classbreak */
  69. /* decide whether to use 1*stdev or 0.5*stdev as step */
  70. i = 1;
  71. while (i) {
  72. if (((stats.mean) + (stats.stdev * scale * (nbclass / 2 - 1)) >
  73. stats.max) ||
  74. ((stats.mean) - (stats.stdev * scale * (nbclass / 2 - 1)) <
  75. stats.min))
  76. scale = scale / 2;
  77. else
  78. i = 0;
  79. }
  80. /* classbreaks below the mean and on the mean */
  81. for (i = 0; i <= nbreaks / 2; i++)
  82. classbreaks[i] =
  83. stats.mean - stats.stdev * scale * (nbreaks / 2 - i);
  84. /* classbreaks above the mean */
  85. for (i = i; i < nbreaks; i++)
  86. classbreaks[i] =
  87. stats.mean + stats.stdev * scale * (i - nbreaks / 2);
  88. }
  89. return (scale);
  90. }
  91. int class_quant(double *data, int count, int nbreaks, double *classbreaks)
  92. {
  93. int i, step;
  94. step = count / (nbreaks + 1);
  95. for (i = 0; i < nbreaks; i++)
  96. classbreaks[i] = data[step * (i + 1)];
  97. return (1);
  98. }
  99. int class_equiprob(double *data, int count, int *nbreaks, double *classbreaks)
  100. {
  101. int i, j;
  102. double *lequi; /*Vector of scale factors for probabilities of the normal distribution */
  103. struct GASTATS stats;
  104. int nbclass;
  105. nbclass = *nbreaks + 1;
  106. lequi = G_malloc(*nbreaks * sizeof(double));
  107. /* The following values come from the normal distribution and will be used as:
  108. * classbreak[i] = (lequi[i] * stdev) + mean;
  109. */
  110. if (nbclass < 3) {
  111. lequi[0] = 0;
  112. }
  113. else if (nbclass == 3) {
  114. lequi[0] = -0.43076;
  115. lequi[1] = 0.43076;
  116. }
  117. else if (nbclass == 4) {
  118. lequi[0] = -0.6745;
  119. lequi[1] = 0;
  120. lequi[2] = 0.6745;
  121. }
  122. else if (nbclass == 5) {
  123. lequi[0] = -0.8416;
  124. lequi[1] = -0.2533;
  125. lequi[2] = 0.2533;
  126. lequi[3] = 0.8416;
  127. }
  128. else if (nbclass == 6) {
  129. lequi[0] = -0.9676;
  130. lequi[1] = -0.43076;
  131. lequi[2] = 0;
  132. lequi[3] = 0.43076;
  133. lequi[4] = 0.9676;
  134. }
  135. else if (nbclass == 7) {
  136. lequi[0] = -1.068;
  137. lequi[1] = -0.566;
  138. lequi[2] = -0.18;
  139. lequi[3] = 0.18;
  140. lequi[4] = 0.566;
  141. lequi[5] = 1.068;
  142. }
  143. else if (nbclass == 8) {
  144. lequi[0] = -1.1507;
  145. lequi[1] = -0.6745;
  146. lequi[2] = -0.3187;
  147. lequi[3] = 0;
  148. lequi[4] = 0.3187;
  149. lequi[5] = 0.6745;
  150. lequi[6] = 1.1507;
  151. }
  152. else if (nbclass == 9) {
  153. lequi[0] = -1.2208;
  154. lequi[1] = -0.7648;
  155. lequi[2] = -0.4385;
  156. lequi[3] = -0.1397;
  157. lequi[4] = 0.1397;
  158. lequi[5] = 0.4385;
  159. lequi[6] = 0.7648;
  160. lequi[7] = 1.2208;
  161. }
  162. else if (nbclass == 10) {
  163. lequi[0] = -1.28155;
  164. lequi[1] = -0.84162;
  165. lequi[2] = -0.5244;
  166. lequi[3] = -0.25335;
  167. lequi[4] = 0;
  168. lequi[5] = 0.25335;
  169. lequi[6] = 0.5244;
  170. lequi[7] = 0.84162;
  171. lequi[8] = 1.28155;
  172. }
  173. else {
  174. G_fatal_error
  175. ("Equiprobable classbreaks currently limited to 10 classes");
  176. }
  177. basic_stats(data, count, &stats);
  178. /* Check if any of the classbreaks would fall outside of the range min-max */
  179. j = 0;
  180. for (i = 0; i < *nbreaks; i++) {
  181. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  182. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  183. j++;
  184. }
  185. }
  186. if (j < (*nbreaks)) {
  187. G_warning(_("There are classbreaks outside the range min-max. Number of "
  188. "classes reduced to %i, but using probabilities for %i classes."),
  189. j + 1, *nbreaks + 1);
  190. G_realloc(classbreaks, j * sizeof(double));
  191. for (i = 0; i < j; i++)
  192. classbreaks[i] = 0;
  193. }
  194. j = 0;
  195. for (i = 0; i < *nbreaks; i++) {
  196. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  197. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  198. classbreaks[j] = lequi[i] * stats.stdev + stats.mean;
  199. j++;
  200. }
  201. }
  202. *nbreaks = j;
  203. G_free(lequi);
  204. return (1);
  205. }
  206. /* FIXME: there seems to a problem with array overflow, probably due to
  207. the fact that the code was ported from fortran which has 1-based arrays*/
  208. double class_discont(double *data, int count, int nbreaks,
  209. double *classbreaks)
  210. {
  211. int *num, nbclass;
  212. double *no, *zz, *nz, *xn, *co;
  213. double *x; /* Vector standardized observations */
  214. int i, j, k;
  215. double min = 0, max = 0, rangemax = 0;
  216. int n = 0;
  217. double rangemin = 0, xlim = 0;
  218. double dmax = 0.0, d2 = 0.0, dd = 0.0, p = 0.0;
  219. int nf = 0, nmax = 0;
  220. double *abc;
  221. int nd = 0;
  222. double den = 0, d = 0;
  223. int im = 0, ji = 0;
  224. int tmp = 0;
  225. int nff = 0, jj = 0, no1 = 0, no2 = 0;
  226. double f = 0, xt1 = 0, xt2 = 0, chi2 = 1000.0, xnj_1 = 0, xj_1 = 0;
  227. /*get the number of values */
  228. n = count;
  229. nbclass = nbreaks + 1;
  230. num = G_malloc((nbclass + 1) * sizeof(int));
  231. no = G_malloc((nbclass + 1) * sizeof(double));
  232. zz = G_malloc((nbclass + 1) * sizeof(double));
  233. nz = G_malloc(3 * sizeof(double));
  234. xn = G_malloc((n + 1) * sizeof(double));
  235. co = G_malloc((nbclass + 1) * sizeof(double));
  236. /* We copy the array of values to x, in order to be able to standardize it */
  237. x = G_malloc((n + 1) * sizeof(double));
  238. x[0] = n;
  239. xn[0] = 0;
  240. min = data[0];
  241. max = data[count - 1];
  242. for (i = 1; i <= n; i++)
  243. x[i] = data[i - 1];
  244. rangemax = max - min;
  245. rangemin = rangemax;
  246. for (i = 2; i <= n; i++) {
  247. if (x[i] != x[i - 1] && x[i] - x[i - 1] < rangemin)
  248. rangemin = x[i] - x[i - 1]; /* rangemin = minimal distance */
  249. }
  250. /* STANDARDIZATION
  251. * and creation of the number vector (xn) */
  252. for (i = 1; i <= n; i++) {
  253. x[i] = (x[i] - min) / rangemax;
  254. xn[i] = i / (double)n;
  255. }
  256. xlim = rangemin / rangemax;
  257. rangemin = rangemin / 2.0;
  258. /* Searching for the limits */
  259. num[1] = n;
  260. abc = G_malloc(3 * sizeof(double));
  261. /* Loop through possible solutions */
  262. for (i = 1; i <= nbclass; i++) {
  263. nmax = 0;
  264. dmax = 0.0;
  265. d2 = 0.0;
  266. nf = 0; /*End number */
  267. /* Loop through classes */
  268. for (j = 1; j <= i; j++) {
  269. nd = nf; /*Start number */
  270. nf = num[j];
  271. co[j] = 10e37;
  272. eqdrt(x, xn, nd, nf, abc);
  273. den = sqrt(pow(abc[1], 2) + 1.0);
  274. nd++;
  275. /* Loop through observations */
  276. for (k = nd; k <= nf; k++) {
  277. if (abc[2] == 0.0)
  278. d = fabs((-1.0 * abc[1] * x[k]) + xn[k] - abc[0]) / den;
  279. else
  280. d = fabs(x[k] - abc[2]);
  281. d2 += pow(d, 2);
  282. if (x[k] - x[nd] < xlim)
  283. continue;
  284. if (x[nf] - x[k] < xlim)
  285. continue;
  286. if (d <= dmax)
  287. continue;
  288. dmax = d;
  289. nmax = k;
  290. }
  291. nd--; /* A VERIFIER! */
  292. if (x[nf] != x[nd]) {
  293. if (nd != 0)
  294. co[j] = (xn[nf] - xn[nd]) / (x[nf] - x[nd]);
  295. else
  296. co[j] = (xn[nf]) / (x[nf]); /* A VERIFIER! */
  297. }
  298. }
  299. if (i == 1)
  300. dd = d2;
  301. p = d2 / dd;
  302. for (j = 1; j <= i; j++) {
  303. no[j] = num[j];
  304. zz[j] = x[num[j]] * rangemax + min;
  305. if (j == i)
  306. continue;
  307. if (co[j] > co[j + 1]) {
  308. zz[j] = zz[j] + rangemin;
  309. continue;
  310. }
  311. zz[j] = zz[j] - rangemin;
  312. no[j] = no[j] - 1;
  313. }
  314. im = i - 1;
  315. if (im != 0.0) {
  316. for (j = 1; j <= im; j++) {
  317. ji = i + 1 - j;
  318. no[ji] -= no[ji - 1];
  319. }
  320. }
  321. if (nmax == 0) {
  322. break;
  323. }
  324. nff = i + 2;
  325. tmp = 0;
  326. for (j = 1; j <= i; j++) {
  327. jj = nff - j;
  328. if (num[jj - 1] < nmax) {
  329. num[jj] = nmax;
  330. tmp = 1;
  331. break;
  332. }
  333. num[jj] = num[jj - 1];
  334. }
  335. if (tmp == 0) {
  336. num[1] = nmax;
  337. jj = 1;
  338. }
  339. if (jj == 1) {
  340. xnj_1 = 0;
  341. xj_1 = 0;
  342. }
  343. else {
  344. xnj_1 = xn[num[jj - 1]];
  345. xj_1 = x[num[jj - 1]];
  346. }
  347. no1 = (xn[num[jj]] - xnj_1) * n;
  348. no2 = (xn[num[jj + 1]] - xn[num[jj]]) * n;
  349. f = (xn[num[jj + 1]] - xnj_1) / (x[num[jj + 1]] - xj_1);
  350. f *= n;
  351. xt1 = (x[num[jj]] - xj_1) * f;
  352. xt2 = (x[num[jj + 1]] - x[num[jj]]) * f;
  353. if (xt2 == 0) {
  354. xt2 = rangemin / 2.0 / rangemax * f;
  355. xt1 -= xt2;
  356. }
  357. else if (xt1 * xt2 == 0) {
  358. xt1 = rangemin / 2.0 / rangemax * f;
  359. xt2 -= xt1;
  360. }
  361. /* calculate chi-square to indicate statistical significance of new class, i.e. how probable would it be that the new class could be the result of purely random choice */
  362. if (chi2 > pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2))
  363. chi2 = pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2);
  364. }
  365. /* Fill up classbreaks of i <=nbclass classes */
  366. for (j = 0; j <= (i - 1); j++)
  367. classbreaks[j] = zz[j + 1];
  368. return (chi2);
  369. }
  370. int class_frequencies(double *data, int count, int nbreaks,
  371. double *classbreaks, int *frequencies)
  372. {
  373. int i, j;
  374. double min, max;
  375. min = data[0];
  376. max = data[count - 1];
  377. /* count cases in all classes, except for last class */
  378. i = 0;
  379. for (j = 0; j < nbreaks; j++) {
  380. while (data[i] <= classbreaks[j]) {
  381. frequencies[j]++;
  382. i++;
  383. }
  384. }
  385. /*Now count cases in last class */
  386. for (i = i; i < count; i++) {
  387. frequencies[nbreaks]++;
  388. }
  389. return (1);
  390. }