class.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. /* functions to classify sorted arrays of doubles and fill a vector of classbreaks */
  2. #include <grass/glocale.h>
  3. #include <grass/arraystats.h>
  4. int AS_option_to_algorithm(const struct Option * option)
  5. {
  6. if (G_strcasecmp(option->answer, "int") == 0)
  7. return CLASS_INTERVAL;
  8. if (G_strcasecmp(option->answer, "std") == 0)
  9. return CLASS_STDEV;
  10. if (G_strcasecmp(option->answer, "qua") == 0)
  11. return CLASS_QUANT;
  12. if (G_strcasecmp(option->answer, "equ") == 0)
  13. return CLASS_EQUIPROB;
  14. if (G_strcasecmp(option->answer, "dis") == 0)
  15. return CLASS_DISCONT;
  16. G_fatal_error(_("Unknown algorithm '%s'"), option->answer);
  17. }
  18. double AS_class_apply_algorithm(int algo, double *data, int nrec, int *nbreaks,
  19. double *classbreaks)
  20. {
  21. double finfo = 0.0;
  22. switch (algo) {
  23. case CLASS_INTERVAL:
  24. finfo = AS_class_interval(data, nrec, *nbreaks, classbreaks);
  25. break;
  26. case CLASS_STDEV:
  27. finfo = AS_class_stdev(data, nrec, *nbreaks, classbreaks);
  28. break;
  29. case CLASS_QUANT:
  30. finfo = AS_class_quant(data, nrec, *nbreaks, classbreaks);
  31. break;
  32. case CLASS_EQUIPROB:
  33. finfo = AS_class_equiprob(data, nrec, nbreaks, classbreaks);
  34. break;
  35. case CLASS_DISCONT:
  36. /* finfo = class_discont(data, nrec, *nbreaks, classbreaks); disabled because of bugs */
  37. G_fatal_error(_("Discont algorithm currently not available because of bugs"));
  38. break;
  39. default:
  40. break;
  41. }
  42. if (finfo == 0)
  43. G_fatal_error(_("Classification algorithm failed"));
  44. return finfo;
  45. }
  46. int AS_class_interval(double *data, int count, int nbreaks, double *classbreaks)
  47. {
  48. double min, max;
  49. double step;
  50. int i = 0;
  51. min = data[0];
  52. max = data[count - 1];
  53. step = (max - min) / (nbreaks + 1);
  54. for (i = 0; i < nbreaks; i++)
  55. classbreaks[i] = min + (step * (i + 1));
  56. return (1);
  57. }
  58. double AS_class_stdev(double *data, int count, int nbreaks, double *classbreaks)
  59. {
  60. struct GASTATS stats;
  61. int i;
  62. int nbclass;
  63. double scale = 1.0;
  64. AS_basic_stats(data, count, &stats);
  65. nbclass = nbreaks + 1;
  66. if (nbclass % 2 == 1) { /* number of classes is uneven so we center middle class on mean */
  67. /* find appropriate fraction of stdev for step */
  68. i = 1;
  69. while (i) {
  70. if (((stats.mean + stats.stdev * scale / 2) +
  71. (stats.stdev * scale * (nbclass / 2 - 1)) > stats.max) ||
  72. ((stats.mean - stats.stdev * scale / 2) -
  73. (stats.stdev * scale * (nbclass / 2 - 1)) < stats.min))
  74. scale = scale / 2;
  75. else
  76. i = 0;
  77. }
  78. /* classbreaks below the mean */
  79. for (i = 0; i < nbreaks / 2; i++)
  80. classbreaks[i] =
  81. (stats.mean - stats.stdev * scale / 2) -
  82. stats.stdev * scale * (nbreaks / 2 - (i + 1));
  83. /* classbreaks above the mean */
  84. for (i = i; i < nbreaks; i++)
  85. classbreaks[i] =
  86. (stats.mean + stats.stdev * scale / 2) +
  87. stats.stdev * scale * (i - nbreaks / 2);
  88. }
  89. else { /* number of classes is even so mean is a classbreak */
  90. /* decide whether to use 1*stdev or 0.5*stdev as step */
  91. i = 1;
  92. while (i) {
  93. if (((stats.mean) + (stats.stdev * scale * (nbclass / 2 - 1)) >
  94. stats.max) ||
  95. ((stats.mean) - (stats.stdev * scale * (nbclass / 2 - 1)) <
  96. stats.min))
  97. scale = scale / 2;
  98. else
  99. i = 0;
  100. }
  101. /* classbreaks below the mean and on the mean */
  102. for (i = 0; i <= nbreaks / 2; i++)
  103. classbreaks[i] =
  104. stats.mean - stats.stdev * scale * (nbreaks / 2 - i);
  105. /* classbreaks above the mean */
  106. for (i = i; i < nbreaks; i++)
  107. classbreaks[i] =
  108. stats.mean + stats.stdev * scale * (i - nbreaks / 2);
  109. }
  110. return (scale);
  111. }
  112. int AS_class_quant(double *data, int count, int nbreaks, double *classbreaks)
  113. {
  114. int i, step;
  115. step = count / (nbreaks + 1);
  116. for (i = 0; i < nbreaks; i++)
  117. classbreaks[i] = data[step * (i + 1)];
  118. return (1);
  119. }
  120. int AS_class_equiprob(double *data, int count, int *nbreaks, double *classbreaks)
  121. {
  122. int i, j;
  123. double *lequi; /*Vector of scale factors for probabilities of the normal distribution */
  124. struct GASTATS stats;
  125. int nbclass;
  126. nbclass = *nbreaks + 1;
  127. lequi = G_malloc(*nbreaks * sizeof(double));
  128. /* The following values come from the normal distribution and will be used as:
  129. * classbreak[i] = (lequi[i] * stdev) + mean;
  130. */
  131. if (nbclass < 3) {
  132. lequi[0] = 0;
  133. }
  134. else if (nbclass == 3) {
  135. lequi[0] = -0.43076;
  136. lequi[1] = 0.43076;
  137. }
  138. else if (nbclass == 4) {
  139. lequi[0] = -0.6745;
  140. lequi[1] = 0;
  141. lequi[2] = 0.6745;
  142. }
  143. else if (nbclass == 5) {
  144. lequi[0] = -0.8416;
  145. lequi[1] = -0.2533;
  146. lequi[2] = 0.2533;
  147. lequi[3] = 0.8416;
  148. }
  149. else if (nbclass == 6) {
  150. lequi[0] = -0.9676;
  151. lequi[1] = -0.43076;
  152. lequi[2] = 0;
  153. lequi[3] = 0.43076;
  154. lequi[4] = 0.9676;
  155. }
  156. else if (nbclass == 7) {
  157. lequi[0] = -1.068;
  158. lequi[1] = -0.566;
  159. lequi[2] = -0.18;
  160. lequi[3] = 0.18;
  161. lequi[4] = 0.566;
  162. lequi[5] = 1.068;
  163. }
  164. else if (nbclass == 8) {
  165. lequi[0] = -1.1507;
  166. lequi[1] = -0.6745;
  167. lequi[2] = -0.3187;
  168. lequi[3] = 0;
  169. lequi[4] = 0.3187;
  170. lequi[5] = 0.6745;
  171. lequi[6] = 1.1507;
  172. }
  173. else if (nbclass == 9) {
  174. lequi[0] = -1.2208;
  175. lequi[1] = -0.7648;
  176. lequi[2] = -0.4385;
  177. lequi[3] = -0.1397;
  178. lequi[4] = 0.1397;
  179. lequi[5] = 0.4385;
  180. lequi[6] = 0.7648;
  181. lequi[7] = 1.2208;
  182. }
  183. else if (nbclass == 10) {
  184. lequi[0] = -1.28155;
  185. lequi[1] = -0.84162;
  186. lequi[2] = -0.5244;
  187. lequi[3] = -0.25335;
  188. lequi[4] = 0;
  189. lequi[5] = 0.25335;
  190. lequi[6] = 0.5244;
  191. lequi[7] = 0.84162;
  192. lequi[8] = 1.28155;
  193. }
  194. else {
  195. G_fatal_error(_("Equiprobable classbreaks currently limited to 10 classes"));
  196. }
  197. AS_basic_stats(data, count, &stats);
  198. /* Check if any of the classbreaks would fall outside of the range min-max */
  199. j = 0;
  200. for (i = 0; i < *nbreaks; i++) {
  201. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  202. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  203. j++;
  204. }
  205. }
  206. if (j < (*nbreaks)) {
  207. G_warning(_("There are classbreaks outside the range min-max. Number of "
  208. "classes reduced to %i, but using probabilities for %i classes."),
  209. j + 1, *nbreaks + 1);
  210. G_realloc(classbreaks, j * sizeof(double));
  211. for (i = 0; i < j; i++)
  212. classbreaks[i] = 0;
  213. }
  214. j = 0;
  215. for (i = 0; i < *nbreaks; i++) {
  216. if ((lequi[i] * stats.stdev + stats.mean) >= stats.min &&
  217. (lequi[i] * stats.stdev) + stats.mean <= stats.max) {
  218. classbreaks[j] = lequi[i] * stats.stdev + stats.mean;
  219. j++;
  220. }
  221. }
  222. *nbreaks = j;
  223. G_free(lequi);
  224. return (1);
  225. }
  226. /* FIXME: there seems to a problem with array overflow, probably due to
  227. the fact that the code was ported from fortran which has 1-based arrays*/
  228. double AS_class_discont(double *data, int count, int nbreaks,
  229. double *classbreaks)
  230. {
  231. int *num, nbclass;
  232. double *no, *zz, *nz, *xn, *co;
  233. double *x; /* Vector standardized observations */
  234. int i, j, k;
  235. double min = 0, max = 0, rangemax = 0;
  236. int n = 0;
  237. double rangemin = 0, xlim = 0;
  238. double dmax = 0.0, d2 = 0.0, dd = 0.0, p = 0.0;
  239. int nf = 0, nmax = 0;
  240. double *abc;
  241. int nd = 0;
  242. double den = 0, d = 0;
  243. int im = 0, ji = 0;
  244. int tmp = 0;
  245. int nff = 0, jj = 0, no1 = 0, no2 = 0;
  246. double f = 0, xt1 = 0, xt2 = 0, chi2 = 1000.0, xnj_1 = 0, xj_1 = 0;
  247. /*get the number of values */
  248. n = count;
  249. nbclass = nbreaks + 1;
  250. num = G_malloc((nbclass + 1) * sizeof(int));
  251. no = G_malloc((nbclass + 1) * sizeof(double));
  252. zz = G_malloc((nbclass + 1) * sizeof(double));
  253. nz = G_malloc(3 * sizeof(double));
  254. xn = G_malloc((n + 1) * sizeof(double));
  255. co = G_malloc((nbclass + 1) * sizeof(double));
  256. /* We copy the array of values to x, in order to be able to standardize it */
  257. x = G_malloc((n + 1) * sizeof(double));
  258. x[0] = n;
  259. xn[0] = 0;
  260. min = data[0];
  261. max = data[count - 1];
  262. for (i = 1; i <= n; i++)
  263. x[i] = data[i - 1];
  264. rangemax = max - min;
  265. rangemin = rangemax;
  266. for (i = 2; i <= n; i++) {
  267. if (x[i] != x[i - 1] && x[i] - x[i - 1] < rangemin)
  268. rangemin = x[i] - x[i - 1]; /* rangemin = minimal distance */
  269. }
  270. /* STANDARDIZATION
  271. * and creation of the number vector (xn) */
  272. for (i = 1; i <= n; i++) {
  273. x[i] = (x[i] - min) / rangemax;
  274. xn[i] = i / (double)n;
  275. }
  276. xlim = rangemin / rangemax;
  277. rangemin = rangemin / 2.0;
  278. /* Searching for the limits */
  279. num[1] = n;
  280. abc = G_malloc(3 * sizeof(double));
  281. /* Loop through possible solutions */
  282. for (i = 1; i <= nbclass; i++) {
  283. nmax = 0;
  284. dmax = 0.0;
  285. d2 = 0.0;
  286. nf = 0; /*End number */
  287. /* Loop through classes */
  288. for (j = 1; j <= i; j++) {
  289. nd = nf; /*Start number */
  290. nf = num[j];
  291. co[j] = 10e37;
  292. AS_eqdrt(x, xn, nd, nf, abc);
  293. den = sqrt(pow(abc[1], 2) + 1.0);
  294. nd++;
  295. /* Loop through observations */
  296. for (k = nd; k <= nf; k++) {
  297. if (abc[2] == 0.0)
  298. d = fabs((-1.0 * abc[1] * x[k]) + xn[k] - abc[0]) / den;
  299. else
  300. d = fabs(x[k] - abc[2]);
  301. d2 += pow(d, 2);
  302. if (x[k] - x[nd] < xlim)
  303. continue;
  304. if (x[nf] - x[k] < xlim)
  305. continue;
  306. if (d <= dmax)
  307. continue;
  308. dmax = d;
  309. nmax = k;
  310. }
  311. nd--; /* A VERIFIER! */
  312. if (x[nf] != x[nd]) {
  313. if (nd != 0)
  314. co[j] = (xn[nf] - xn[nd]) / (x[nf] - x[nd]);
  315. else
  316. co[j] = (xn[nf]) / (x[nf]); /* A VERIFIER! */
  317. }
  318. }
  319. if (i == 1)
  320. dd = d2;
  321. p = d2 / dd;
  322. for (j = 1; j <= i; j++) {
  323. no[j] = num[j];
  324. zz[j] = x[num[j]] * rangemax + min;
  325. if (j == i)
  326. continue;
  327. if (co[j] > co[j + 1]) {
  328. zz[j] = zz[j] + rangemin;
  329. continue;
  330. }
  331. zz[j] = zz[j] - rangemin;
  332. no[j] = no[j] - 1;
  333. }
  334. im = i - 1;
  335. if (im != 0.0) {
  336. for (j = 1; j <= im; j++) {
  337. ji = i + 1 - j;
  338. no[ji] -= no[ji - 1];
  339. }
  340. }
  341. if (nmax == 0) {
  342. break;
  343. }
  344. nff = i + 2;
  345. tmp = 0;
  346. for (j = 1; j <= i; j++) {
  347. jj = nff - j;
  348. if (num[jj - 1] < nmax) {
  349. num[jj] = nmax;
  350. tmp = 1;
  351. break;
  352. }
  353. num[jj] = num[jj - 1];
  354. }
  355. if (tmp == 0) {
  356. num[1] = nmax;
  357. jj = 1;
  358. }
  359. if (jj == 1) {
  360. xnj_1 = 0;
  361. xj_1 = 0;
  362. }
  363. else {
  364. xnj_1 = xn[num[jj - 1]];
  365. xj_1 = x[num[jj - 1]];
  366. }
  367. no1 = (xn[num[jj]] - xnj_1) * n;
  368. no2 = (xn[num[jj + 1]] - xn[num[jj]]) * n;
  369. f = (xn[num[jj + 1]] - xnj_1) / (x[num[jj + 1]] - xj_1);
  370. f *= n;
  371. xt1 = (x[num[jj]] - xj_1) * f;
  372. xt2 = (x[num[jj + 1]] - x[num[jj]]) * f;
  373. if (xt2 == 0) {
  374. xt2 = rangemin / 2.0 / rangemax * f;
  375. xt1 -= xt2;
  376. }
  377. else if (xt1 * xt2 == 0) {
  378. xt1 = rangemin / 2.0 / rangemax * f;
  379. xt2 -= xt1;
  380. }
  381. /* calculate chi-square to indicate statistical significance of new class, i.e. how probable would it be that the new class could be the result of purely random choice */
  382. if (chi2 > pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2))
  383. chi2 = pow((double)((no1 - no2) - (xt1 - xt2)), 2) / (xt1 + xt2);
  384. }
  385. /* Fill up classbreaks of i <=nbclass classes */
  386. for (j = 0; j <= (i - 1); j++)
  387. classbreaks[j] = zz[j + 1];
  388. return (chi2);
  389. }
  390. int AS_class_frequencies(double *data, int count, int nbreaks,
  391. double *classbreaks, int *frequencies)
  392. {
  393. int i, j;
  394. double min, max;
  395. min = data[0];
  396. max = data[count - 1];
  397. /* count cases in all classes, except for last class */
  398. i = 0;
  399. for (j = 0; j < nbreaks; j++) {
  400. while (data[i] <= classbreaks[j]) {
  401. frequencies[j]++;
  402. i++;
  403. }
  404. }
  405. /*Now count cases in last class */
  406. for (i = i; i < count; i++) {
  407. frequencies[nbreaks]++;
  408. }
  409. return (1);
  410. }