class.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. /* functions to classify sorted arrays of doubles and fill a vector of classbreaks */
  2. #include <grass/glocale.h>
  3. #include <grass/arraystats.h>
  4. double class_apply_algorithm(char *algo, double *data, int nrec, int *nbreaks,
  5. double *classbreaks)
  6. {
  7. double finfo = 0.0;
  8. if (G_strcasecmp(algo, "int") == 0)
  9. finfo = class_interval(data, nrec, *nbreaks, classbreaks);
  10. else if (G_strcasecmp(algo, "std") == 0)
  11. finfo = class_stdev(data, nrec, *nbreaks, classbreaks);
  12. else if (G_strcasecmp(algo, "qua") == 0)
  13. finfo = class_quant(data, nrec, *nbreaks, classbreaks);
  14. else if (G_strcasecmp(algo, "equ") == 0)
  15. finfo = class_equiprob(data, nrec, nbreaks, classbreaks);
  16. else if (G_strcasecmp(algo, "dis") == 0)
  17. /* finfo = class_discont(data, nrec, *nbreaks, classbreaks); disabled because of bugs */
  18. G_fatal_error(_("Discont algorithm currently not available because of bugs"));
  19. else
  20. G_fatal_error(_("%s: Unknown algorithm"), algo);
  21. if (finfo == 0)
  22. G_fatal_error(_("%s: Error in classification algorithm"), algo);
  23. return finfo;
  24. }
  25. int class_interval(double *data, int count, int nbreaks, double *classbreaks)
  26. {
  27. double min, max;
  28. double step;
  29. int i = 0;
  30. min = data[0];
  31. max = data[count - 1];
  32. step = (max - min) / (nbreaks + 1);
  33. for (i = 0; i < nbreaks; i++)
  34. classbreaks[i] = min + (step * (i + 1));
  35. return (1);
  36. }
  37. double class_stdev(double *data, int count, int nbreaks, double *classbreaks)
  38. {
  39. struct GASTATS stats;
  40. int i;
  41. int nbclass;
  42. double scale = 1.0;
  43. basic_stats(data, count, &stats);
  44. nbclass = nbreaks + 1;
  45. if (nbclass % 2 == 1) { /* number of classes is uneven so we center middle class on mean */
  46. /* find appropriate fraction of stdev for step */
  47. i = 1;
  48. while (i) {
  49. if (((stats.mean + stats.stdev * scale / 2) +
  50. (stats.stdev * scale * (nbclass / 2 - 1)) > stats.max) ||
  51. ((stats.mean - stats.stdev * scale / 2) -
  52. (stats.stdev * scale * (nbclass / 2 - 1)) < stats.min))
  53. scale = scale / 2;
  54. else
  55. i = 0;
  56. }
  57. /* classbreaks below the mean */
  58. for (i = 0; i < nbreaks / 2; i++)
  59. classbreaks[i] =
  60. (stats.mean - stats.stdev * scale / 2) -
  61. stats.stdev * scale * (nbreaks / 2 - (i + 1));
  62. /* classbreaks above the mean */
  63. for (i = i; i < nbreaks; i++)
  64. classbreaks[i] =
  65. (stats.mean + stats.stdev * scale / 2) +
  66. stats.stdev * scale * (i - nbreaks / 2);
  67. }
  68. else { /* number of classes is even so mean is a classbreak */
  69. /* decide whether to use 1*stdev or 0.5*stdev as step */
  70. i = 1;
  71. while (i) {
  72. if (((stats.mean) + (stats.stdev * scale * (nbclass / 2 - 1)) >
  73. stats.max) ||
  74. ((stats.mean) - (stats.stdev * scale * (nbclass / 2 - 1)) <
  75. stats.min))
  76. scale = scale / 2;
  77. else
  78. i = 0;
  79. }
  80. /* classbreaks below the mean and on the mean */
  81. for (i = 0; i <= nbreaks / 2; i++)
  82. classbreaks[i] =
  83. stats.mean - stats.stdev * scale * (nbreaks / 2 - i);
  84. /* classbreaks above the mean */
  85. for (i = i; i < nbreaks; i++)
  86. classbreaks[i] =
  87. stats.mean + stats.stdev * scale * (i - nbreaks / 2);
  88. }
  89. return (scale);
  90. }
  91. int class_quant(double *data, int count, int nbreaks, double *classbreaks)
  92. {
  93. int i, step;
  94. step = count / (nbreaks + 1);
  95. for (i = 0; i < nbreaks; i++)
  96. classbreaks[i] = data[step * (i + 1)];
  97. return (1);
  98. }
  99. int class_equiprob(double *data, int count, int *nbreaks, double *classbreaks)
  100. {
  101. int i, j;
  102. double *lequi; /*Vector of scale factors for probabilities of the normal distribution */
  103. struct GASTATS stats;
  104. int nbclass;
  105. nbclass = *nbreaks + 1;
  106. lequi = G_malloc(*nbreaks * sizeof(double));
  107. /*The following values come from the normal distribution and
  108. * will be used as:
  109. * classbreak[i] = (lequi[i] * stdev) + mean;
  110. */
  111. if (nbclass < 3) {
  112. lequi[0] = 0;
  113. }
  114. else if (nbclass == 3) {
  115. lequi[0] = -0.43076;
  116. lequi[1] = 0.43076;
  117. }
  118. else if (nbclass == 4) {
  119. lequi[0] = -0.6745;
  120. lequi[1] = 0;
  121. lequi[2] = 0.6745;
  122. }
  123. else if (nbclass == 5) {
  124. lequi[0] = -0.8416;
  125. lequi[1] = -0.2533;
  126. lequi[2] = 0.2533;
  127. lequi[3] = 0.8416;
  128. }
  129. else if (nbclass == 6) {
  130. lequi[0] = -0.9676;
  131. lequi[1] = -0.43076;
  132. lequi[2] = 0;
  133. lequi[3] = 0.43076;
  134. lequi[4] = 0.9676;
  135. }
  136. else if (nbclass == 7) {
  137. lequi[0] = -1.068;
  138. lequi[1] = -0.566;
  139. lequi[2] = -0.18;
  140. lequi[3] = 0.18;
  141. lequi[4] = 0.566;
  142. lequi[5] = 1.068;
  143. }
  144. else if (nbclass == 8) {
  145. lequi[0] = -1.1507;
  146. lequi[1] = -0.6745;
  147. lequi[2] = -0.3187;
  148. lequi[3] = 0;
  149. lequi[4] = 0.3187;
  150. lequi[5] = 0.6745;
  151. lequi[6] = 1.1507;
  152. }
  153. else if (nbclass == 9) {
  154. lequi[0] = -1.2208;
  155. lequi[1] = -0.7648;
  156. lequi[2] = -0.4385;
  157. lequi[3] = -0.1397;
  158. lequi[4] = 0.1397;
  159. lequi[5] = 0.4385;
  160. lequi[6] = 0.7648;
  161. lequi[7] = 1.2208;
  162. }
  163. else if (nbclass == 10) {
  164. lequi[0] = -1.28155;
  165. lequi[1] = -0.84162;
  166. lequi[2] = -0.5244;
  167. lequi[3] = -0.25335;
  168. lequi[4] = 0;
  169. lequi[5] = 0.25335;
  170. lequi[6] = 0.5244;
  171. lequi[7] = 0.84162;
  172. lequi[8] = 1.28155;
  173. }
  174. else {
  175. G_fatal_error
  176. ("Equiprobable classbreaks currently limited to 10 classes");
  177. }
  178. basic_stats(data, count, &stats);
  179. /*check if any of the classbreaks would fall outside of the range min-max */
  180. j = 0;
  181. for (i = 0; i < *nbreaks; i++) {
  182. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  183. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  184. j++;
  185. }
  186. }
  187. if (j < (*nbreaks)) {
  188. G_warning(_("There are classbreaks outside the range min-max. Number of classes reduced to %i, but using probabilities for %i classes."),
  189. j + 1, *nbreaks + 1);
  190. G_realloc(classbreaks, j * sizeof(double));
  191. for (i = 0; i < j; i++)
  192. classbreaks[i] = 0;
  193. }
  194. j = 0;
  195. for (i = 0; i < *nbreaks; i++) {
  196. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  197. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  198. classbreaks[j] = lequi[i] * stats.stdev + stats.mean;
  199. j++;
  200. }
  201. }
  202. *nbreaks = j;
  203. G_free(lequi);
  204. return (1);
  205. }
  206. /* FixMe: there seems to a problem with array overflow, probably due to the fact that the code was ported from fortran which has 1-based arrays*/
  207. double class_discont(double *data, int count, int nbreaks,
  208. double *classbreaks)
  209. {
  210. int *num, nbclass;
  211. double *no, *zz, *nz, *xn, *co;
  212. double *x; /* Vecteur des observations standardisées */
  213. int i, j, k;
  214. double min = 0, max = 0, rangemax = 0;
  215. int n = 0;
  216. double rangemin = 0, xlim = 0;
  217. double dmax = 0.0, d2 = 0.0, dd = 0.0, p = 0.0;
  218. int nf = 0, nmax = 0;
  219. double *abc;
  220. int nd = 0;
  221. double den = 0, d = 0;
  222. int im = 0, ji = 0;
  223. int tmp = 0;
  224. int nff = 0, jj = 0, no1 = 0, no2 = 0;
  225. double f = 0, xt1 = 0, xt2 = 0, chi2 = 1000.0, xnj_1 = 0, xj_1 = 0;
  226. /*get the number of values */
  227. n = count;
  228. nbclass = nbreaks + 1;
  229. num = G_malloc((nbclass + 1) * sizeof(int));
  230. no = G_malloc((nbclass + 1) * sizeof(double));
  231. zz = G_malloc((nbclass + 1) * sizeof(double));
  232. nz = G_malloc(3 * sizeof(double));
  233. xn = G_malloc((n + 1) * sizeof(double));
  234. co = G_malloc((nbclass + 1) * sizeof(double));
  235. /* We copy the array of values to x, in order to be able to standardize it */
  236. x = G_malloc((n + 1) * sizeof(double));
  237. x[0] = n;
  238. xn[0] = 0;
  239. min = data[0];
  240. max = data[count - 1];
  241. for (i = 1; i <= n; i++)
  242. x[i] = data[i - 1];
  243. rangemax = max - min;
  244. rangemin = rangemax;
  245. for (i = 2; i <= n; i++) {
  246. if (x[i] != x[i - 1] && x[i] - x[i - 1] < rangemin)
  247. rangemin = x[i] - x[i - 1]; /* rangemin = minimal distance */
  248. }
  249. /* STANDARDIZATION
  250. * and creation of the number vector (xn) */
  251. for (i = 1; i <= n; i++) {
  252. x[i] = (x[i] - min) / rangemax;
  253. xn[i] = i / (double)n;
  254. }
  255. xlim = rangemin / rangemax;
  256. rangemin = rangemin / 2.0;
  257. /* Searching for the limits */
  258. num[1] = n;
  259. abc = G_malloc(3 * sizeof(double));
  260. /* Loop through possible solutions */
  261. for (i = 1; i <= nbclass; i++) {
  262. nmax = 0;
  263. dmax = 0.0;
  264. d2 = 0.0;
  265. nf = 0; /*End number */
  266. /* Loop through classes */
  267. for (j = 1; j <= i; j++) {
  268. nd = nf; /*Start number */
  269. nf = num[j];
  270. co[j] = 10e37;
  271. eqdrt(x, xn, nd, nf, abc);
  272. den = sqrt(pow(abc[1], 2) + 1.0);
  273. nd++;
  274. /* Loop through observations */
  275. for (k = nd; k <= nf; k++) {
  276. if (abc[2] == 0.0)
  277. d = fabs((-1.0 * abc[1] * x[k]) + xn[k] - abc[0]) / den;
  278. else
  279. d = fabs(x[k] - abc[2]);
  280. d2 += pow(d, 2);
  281. if (x[k] - x[nd] < xlim)
  282. continue;
  283. if (x[nf] - x[k] < xlim)
  284. continue;
  285. if (d <= dmax)
  286. continue;
  287. dmax = d;
  288. nmax = k;
  289. }
  290. nd--; /* A VERIFIER! */
  291. if (x[nf] != x[nd]) {
  292. if (nd != 0)
  293. co[j] = (xn[nf] - xn[nd]) / (x[nf] - x[nd]);
  294. else
  295. co[j] = (xn[nf]) / (x[nf]); /* A VERIFIER! */
  296. }
  297. }
  298. if (i == 1)
  299. dd = d2;
  300. p = d2 / dd;
  301. for (j = 1; j <= i; j++) {
  302. no[j] = num[j];
  303. zz[j] = x[num[j]] * rangemax + min;
  304. if (j == i)
  305. continue;
  306. if (co[j] > co[j + 1]) {
  307. zz[j] = zz[j] + rangemin;
  308. continue;
  309. }
  310. zz[j] = zz[j] - rangemin;
  311. no[j] = no[j] - 1;
  312. }
  313. im = i - 1;
  314. if (im != 0.0) {
  315. for (j = 1; j <= im; j++) {
  316. ji = i + 1 - j;
  317. no[ji] -= no[ji - 1];
  318. }
  319. }
  320. if (nmax == 0) {
  321. break;
  322. }
  323. nff = i + 2;
  324. tmp = 0;
  325. for (j = 1; j <= i; j++) {
  326. jj = nff - j;
  327. if (num[jj - 1] < nmax) {
  328. num[jj] = nmax;
  329. tmp = 1;
  330. break;
  331. }
  332. num[jj] = num[jj - 1];
  333. }
  334. if (tmp == 0) {
  335. num[1] = nmax;
  336. jj = 1;
  337. }
  338. if (jj == 1) {
  339. xnj_1 = 0;
  340. xj_1 = 0;
  341. }
  342. else {
  343. xnj_1 = xn[num[jj - 1]];
  344. xj_1 = x[num[jj - 1]];
  345. }
  346. no1 = (xn[num[jj]] - xnj_1) * n;
  347. no2 = (xn[num[jj + 1]] - xn[num[jj]]) * n;
  348. f = (xn[num[jj + 1]] - xnj_1) / (x[num[jj + 1]] - xj_1);
  349. f *= n;
  350. xt1 = (x[num[jj]] - xj_1) * f;
  351. xt2 = (x[num[jj + 1]] - x[num[jj]]) * f;
  352. if (xt2 == 0) {
  353. xt2 = rangemin / 2.0 / rangemax * f;
  354. xt1 -= xt2;
  355. }
  356. else if (xt1 * xt2 == 0) {
  357. xt1 = rangemin / 2.0 / rangemax * f;
  358. xt2 -= xt1;
  359. }
  360. /* calculate chi-square to indicate statistical significance of new class, i.e. how probable would it be that the new class could be the result of purely random choice */
  361. if (chi2 > pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2))
  362. chi2 = pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2);
  363. }
  364. /* Fill up classbreaks of i <=nbclass classes */
  365. for (j = 0; j <= (i - 1); j++)
  366. classbreaks[j] = zz[j + 1];
  367. return (chi2);
  368. }
  369. int class_frequencies(double *data, int count, int nbreaks,
  370. double *classbreaks, int *frequencies)
  371. {
  372. int i, j;
  373. double min, max;
  374. min = data[0];
  375. max = data[count - 1];
  376. /* count cases in all classes, except for last class */
  377. i = 0;
  378. for (j = 0; j < nbreaks; j++) {
  379. while (data[i] <= classbreaks[j]) {
  380. frequencies[j]++;
  381. i++;
  382. }
  383. }
  384. /*Now count cases in last class */
  385. for (i = i; i < count; i++) {
  386. frequencies[nbreaks]++;
  387. }
  388. return (1);
  389. }