main.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. /****************************************************************************
  2. *
  3. * MODULE: r.regression.multi
  4. *
  5. * AUTHOR(S): Markus Metz
  6. *
  7. * PURPOSE: Calculates multiple linear regression from raster maps:
  8. * y = b0 + b1*x1 + b2*x2 + ... + bn*xn + e
  9. *
  10. * COPYRIGHT: (C) 2011 by the GRASS Development Team
  11. *
  12. * This program is free software under the GNU General Public
  13. * License (>=v2). Read the file COPYING that comes with GRASS
  14. * for details.
  15. *
  16. *****************************************************************************/
  17. #include <stdio.h>
  18. #include <stdlib.h>
  19. #include <math.h>
  20. #include <string.h>
  21. #include <grass/gis.h>
  22. #include <grass/glocale.h>
  23. #include <grass/raster.h>
  24. struct MATRIX
  25. {
  26. int n; /* SIZE OF THIS MATRIX (N x N) */
  27. double *v;
  28. };
  29. #define M(m,row,col) (m)->v[((row) * ((m)->n)) + (col)]
  30. static int solvemat(struct MATRIX *m, double a[], double B[])
  31. {
  32. int i, j, i2, j2, imark;
  33. double factor, temp;
  34. double pivot; /* ACTUAL VALUE OF THE LARGEST PIVOT CANDIDATE */
  35. for (i = 0; i < m->n; i++) {
  36. j = i;
  37. /* find row with largest magnitude value for pivot value */
  38. pivot = M(m, i, j);
  39. imark = i;
  40. for (i2 = i + 1; i2 < m->n; i2++) {
  41. temp = fabs(M(m, i2, j));
  42. if (temp > fabs(pivot)) {
  43. pivot = M(m, i2, j);
  44. imark = i2;
  45. }
  46. }
  47. /* if the pivot is very small then the points are nearly co-linear */
  48. /* co-linear points result in an undefined matrix, and nearly */
  49. /* co-linear points results in a solution with rounding error */
  50. if (pivot == 0.0) {
  51. G_warning(_("Matrix is unsolvable"));
  52. return 0;
  53. }
  54. /* if row with highest pivot is not the current row, switch them */
  55. if (imark != i) {
  56. for (j2 = 0; j2 < m->n; j2++) {
  57. temp = M(m, imark, j2);
  58. M(m, imark, j2) = M(m, i, j2);
  59. M(m, i, j2) = temp;
  60. }
  61. temp = a[imark];
  62. a[imark] = a[i];
  63. a[i] = temp;
  64. }
  65. /* compute zeros above and below the pivot, and compute
  66. values for the rest of the row as well */
  67. for (i2 = 0; i2 < m->n; i2++) {
  68. if (i2 != i) {
  69. factor = M(m, i2, j) / pivot;
  70. for (j2 = j; j2 < m->n; j2++)
  71. M(m, i2, j2) -= factor * M(m, i, j2);
  72. a[i2] -= factor * a[i];
  73. }
  74. }
  75. }
  76. /* SINCE ALL OTHER VALUES IN THE MATRIX ARE ZERO NOW, CALCULATE THE
  77. COEFFICIENTS BY DIVIDING THE COLUMN VECTORS BY THE DIAGONAL VALUES. */
  78. for (i = 0; i < m->n; i++) {
  79. B[i] = a[i] / M(m, i, i);
  80. }
  81. return 1;
  82. }
  83. int main(int argc, char *argv[])
  84. {
  85. unsigned int r, c, rows, cols, n_valid; /* totals */
  86. int *mapx_fd, mapy_fd, mapres_fd, mapest_fd;
  87. int i, j, k, n_predictors;
  88. double *sumX, sumY, *sumsqX, sumsqY, *sumXY;
  89. double *meanX, meanY, *varX, varY, *sdX, sdY;
  90. double yest, yres; /* estimated y, residual */
  91. double sumYest, *SSerr_without;
  92. double SE;
  93. double meanYest, meanYres, varYest, varYres, sdYest, sdYres;
  94. double SStot, SSerr, SSreg;
  95. double SAE;
  96. double **a;
  97. struct MATRIX *m, *m_all;
  98. double **B, Rsq, Rsqadj, F, t, AIC, AICc, BIC;
  99. unsigned int count = 0;
  100. DCELL **mapx_buf, *mapy_buf, *mapx_val, mapy_val, *mapres_buf, *mapest_buf;
  101. char *name;
  102. struct Option *input_mapx, *input_mapy, *output_res, *output_est, *output_opt;
  103. struct Flag *shell_style;
  104. struct Cell_head region;
  105. struct GModule *module;
  106. G_gisinit(argv[0]);
  107. module = G_define_module();
  108. G_add_keyword(_("raster"));
  109. G_add_keyword(_("statistics"));
  110. G_add_keyword(_("regression"));
  111. module->description =
  112. _("Calculates multiple linear regression from raster maps.");
  113. /* Define the different options */
  114. input_mapx = G_define_standard_option(G_OPT_R_INPUTS);
  115. input_mapx->key = "mapx";
  116. input_mapx->description = (_("Map for x coefficient"));
  117. input_mapy = G_define_standard_option(G_OPT_R_INPUT);
  118. input_mapy->key = "mapy";
  119. input_mapy->description = (_("Map for y coefficient"));
  120. output_res = G_define_standard_option(G_OPT_R_OUTPUT);
  121. output_res->key = "residuals";
  122. output_res->required = NO;
  123. output_res->description = (_("Map to store residuals"));
  124. output_est = G_define_standard_option(G_OPT_R_OUTPUT);
  125. output_est->key = "estimates";
  126. output_est->required = NO;
  127. output_est->description = (_("Map to store estimates"));
  128. output_opt = G_define_standard_option(G_OPT_F_OUTPUT);
  129. output_opt->key = "output";
  130. output_opt->required = NO;
  131. output_opt->description =
  132. (_("ASCII file for storing regression coefficients (output to screen if file not specified)."));
  133. shell_style = G_define_flag();
  134. shell_style->key = 'g';
  135. shell_style->description = _("Print in shell script style");
  136. if (G_parser(argc, argv))
  137. exit(EXIT_FAILURE);
  138. name = output_opt->answer;
  139. if (name != NULL && strcmp(name, "-") != 0) {
  140. if (NULL == freopen(name, "w", stdout)) {
  141. G_fatal_error(_("Unable to open file <%s> for writing"), name);
  142. }
  143. }
  144. G_get_window(&region);
  145. rows = region.rows;
  146. cols = region.cols;
  147. /* count x maps */
  148. for (i = 0; input_mapx->answers[i]; i++);
  149. n_predictors = i;
  150. /* allocate memory for x maps */
  151. mapx_fd = (int *)G_malloc(n_predictors * sizeof(int));
  152. sumX = (double *)G_malloc(n_predictors * sizeof(double));
  153. sumsqX = (double *)G_malloc(n_predictors * sizeof(double));
  154. sumXY = (double *)G_malloc(n_predictors * sizeof(double));
  155. SSerr_without = (double *)G_malloc(n_predictors * sizeof(double));
  156. meanX = (double *)G_malloc(n_predictors * sizeof(double));
  157. varX = (double *)G_malloc(n_predictors * sizeof(double));
  158. sdX = (double *)G_malloc(n_predictors * sizeof(double));
  159. mapx_buf = (DCELL **)G_malloc(n_predictors * sizeof(DCELL *));
  160. mapx_val = (DCELL *)G_malloc((n_predictors + 1) * sizeof(DCELL));
  161. /* ordinary least squares */
  162. m = NULL;
  163. m_all = (struct MATRIX *)G_malloc((n_predictors + 1) * sizeof(struct MATRIX));
  164. a = (double **)G_malloc((n_predictors + 1) * sizeof(double *));
  165. B = (double **)G_malloc((n_predictors + 1) * sizeof(double *));
  166. m = &(m_all[0]);
  167. m->n = n_predictors + 1;
  168. m->v = (double *)G_malloc(m->n * m->n * sizeof(double));
  169. a[0] = (double *)G_malloc(m->n * sizeof(double));
  170. B[0] = (double *)G_malloc(m->n * sizeof(double));
  171. for (i = 0; i < m->n; i++) {
  172. for (j = i; j < m->n; j++)
  173. M(m, i, j) = 0.0;
  174. a[0][i] = 0.0;
  175. B[0][i] = 0.0;
  176. }
  177. for (k = 1; k <= n_predictors; k++) {
  178. m = &(m_all[k]);
  179. m->n = n_predictors;
  180. m->v = (double *)G_malloc(m->n * m->n * sizeof(double));
  181. a[k] = (double *)G_malloc(m->n * sizeof(double));
  182. B[k] = (double *)G_malloc(m->n * sizeof(double));
  183. for (i = 0; i < m->n; i++) {
  184. for (j = i; j < m->n; j++)
  185. M(m, i, j) = 0.0;
  186. a[k][i] = 0.0;
  187. B[k][i] = 0.0;
  188. }
  189. }
  190. /* open maps */
  191. G_debug(1, "open maps");
  192. for (i = 0; i < n_predictors; i++) {
  193. mapx_fd[i] = Rast_open_old(input_mapx->answers[i], "");
  194. }
  195. mapy_fd = Rast_open_old(input_mapy->answer, "");
  196. for (i = 0; i < n_predictors; i++)
  197. mapx_buf[i] = Rast_allocate_d_buf();
  198. mapy_buf = Rast_allocate_d_buf();
  199. for (i = 0; i < n_predictors; i++) {
  200. sumX[i] = sumsqX[i] = sumXY[i] = 0.0;
  201. meanX[i] = varX[i] = sdX[i] = 0.0;
  202. SSerr_without[i] = 0.0;
  203. }
  204. sumY = sumsqY = meanY = varY = sdY = 0.0;
  205. sumYest = meanYest = varYest = sdYest = 0.0;
  206. meanYres = varYres = sdYres = 0.0;
  207. /* read input maps */
  208. G_message(_("First pass..."));
  209. n_valid = 0;
  210. mapx_val[0] = 1.0;
  211. for (r = 0; r < rows; r++) {
  212. G_percent(r, rows, 2);
  213. for (i = 0; i < n_predictors; i++)
  214. Rast_get_d_row(mapx_fd[i], mapx_buf[i], r);
  215. Rast_get_d_row(mapy_fd, mapy_buf, r);
  216. for (c = 0; c < cols; c++) {
  217. int isnull = 0;
  218. for (i = 0; i < n_predictors; i++) {
  219. mapx_val[i + 1] = mapx_buf[i][c];
  220. if (Rast_is_d_null_value(&(mapx_val[i + 1]))) {
  221. isnull = 1;
  222. break;
  223. }
  224. }
  225. if (isnull)
  226. continue;
  227. mapy_val = mapy_buf[c];
  228. if (Rast_is_d_null_value(&mapy_val))
  229. continue;
  230. for (i = 0; i <= n_predictors; i++) {
  231. double val1 = mapx_val[i];
  232. for (j = i; j <= n_predictors; j++) {
  233. double val2 = mapx_val[j];
  234. m = &(m_all[0]);
  235. M(m, i, j) += val1 * val2;
  236. /* linear model without predictor k */
  237. for (k = 1; k <= n_predictors; k++) {
  238. if (k != i && k != j) {
  239. int i2 = k > i ? i : i - 1;
  240. int j2 = k > j ? j : j - 1;
  241. m = &(m_all[k]);
  242. M(m, i2, j2) += val1 * val2;
  243. }
  244. }
  245. }
  246. a[0][i] += mapy_val * val1;
  247. for (k = 1; k <= n_predictors; k++) {
  248. if (k != i) {
  249. int i2 = k > i ? i : i - 1;
  250. a[k][i2] += mapy_val * val1;
  251. }
  252. }
  253. if (i > 0) {
  254. sumX[i - 1] += val1;
  255. sumsqX[i - 1] += val1 * val1;
  256. sumXY[i - 1] += val1 * mapy_val;
  257. }
  258. }
  259. sumY += mapy_val;
  260. sumsqY += mapy_val * mapy_val;
  261. count++;
  262. }
  263. }
  264. G_percent(rows, rows, 2);
  265. if (count < n_predictors + 1)
  266. G_fatal_error(_("Not enough valid cells available"));
  267. for (k = 0; k <= n_predictors; k++) {
  268. m = &(m_all[k]);
  269. /* TRANSPOSE VALUES IN UPPER HALF OF M TO OTHER HALF */
  270. for (i = 1; i < m->n; i++)
  271. for (j = 0; j < i; j++)
  272. M(m, i, j) = M(m, j, i);
  273. if (!solvemat(m, a[k], B[k])) {
  274. for (i = 0; i <= n_predictors; i++) {
  275. fprintf(stdout, "b%d=0.0\n", i);
  276. }
  277. G_fatal_error(_("Multiple regression failed"));
  278. }
  279. }
  280. /* second pass */
  281. G_message(_("Second pass..."));
  282. /* residuals output */
  283. if (output_res->answer) {
  284. mapres_fd = Rast_open_new(output_res->answer, DCELL_TYPE);
  285. mapres_buf = Rast_allocate_d_buf();
  286. }
  287. else {
  288. mapres_fd = -1;
  289. mapres_buf = NULL;
  290. }
  291. /* estimates output */
  292. if (output_est->answer) {
  293. mapest_fd = Rast_open_new(output_est->answer, DCELL_TYPE);
  294. mapest_buf = Rast_allocate_d_buf();
  295. }
  296. else {
  297. mapest_fd = -1;
  298. mapest_buf = NULL;
  299. }
  300. for (i = 0; i < n_predictors; i++)
  301. meanX[i] = sumX[i] / count;
  302. meanY = sumY / count;
  303. SStot = SSerr = SSreg = 0.0;
  304. SAE = 0.0;
  305. for (r = 0; r < rows; r++) {
  306. G_percent(r, rows, 2);
  307. for (i = 0; i < n_predictors; i++)
  308. Rast_get_d_row(mapx_fd[i], mapx_buf[i], r);
  309. Rast_get_d_row(mapy_fd, mapy_buf, r);
  310. if (mapres_buf)
  311. Rast_set_d_null_value(mapres_buf, cols);
  312. if (mapest_buf)
  313. Rast_set_d_null_value(mapest_buf, cols);
  314. for (c = 0; c < cols; c++) {
  315. int isnull = 0;
  316. for (i = 0; i < n_predictors; i++) {
  317. mapx_val[i + 1] = mapx_buf[i][c];
  318. if (Rast_is_d_null_value(&(mapx_val[i + 1]))) {
  319. isnull = 1;
  320. break;
  321. }
  322. }
  323. if (isnull)
  324. continue;
  325. yest = 0.0;
  326. for (i = 0; i <= n_predictors; i++) {
  327. yest += B[0][i] * mapx_val[i];
  328. }
  329. if (mapest_buf)
  330. mapest_buf[c] = yest;
  331. mapy_val = mapy_buf[c];
  332. if (Rast_is_d_null_value(&mapy_val))
  333. continue;
  334. yres = mapy_val - yest;
  335. if (mapres_buf)
  336. mapres_buf[c] = yres;
  337. SStot += (mapy_val - meanY) * (mapy_val - meanY);
  338. SSreg += (yest - meanY) * (yest - meanY);
  339. SSerr += yres * yres;
  340. SAE += fabs(yres);
  341. for (k = 1; k <= n_predictors; k++) {
  342. double yesti = 0.0;
  343. double yresi;
  344. /* linear model without predictor k */
  345. for (i = 0; i <= n_predictors; i++) {
  346. if (i != k) {
  347. j = k > i ? i : i - 1;
  348. yesti += B[k][j] * mapx_val[i];
  349. }
  350. }
  351. yresi = mapy_val - yesti;
  352. /* linear model without predictor k */
  353. SSerr_without[k - 1] += yresi * yresi;
  354. varX[k - 1] = (mapx_val[k] - meanX[k - 1]) * (mapx_val[k] - meanX[k - 1]);
  355. }
  356. }
  357. if (mapres_buf)
  358. Rast_put_d_row(mapres_fd, mapres_buf);
  359. if (mapest_buf)
  360. Rast_put_d_row(mapest_fd, mapest_buf);
  361. }
  362. G_percent(rows, rows, 2);
  363. fprintf(stdout, "n=%d\n", count);
  364. /* coefficient of determination aka R squared */
  365. Rsq = 1 - (SSerr / SStot);
  366. fprintf(stdout, "Rsq=%f\n", Rsq);
  367. /* adjusted coefficient of determination */
  368. Rsqadj = 1 - ((SSerr * (count - 1)) / (SStot * (count - n_predictors - 1)));
  369. fprintf(stdout, "Rsqadj=%f\n", Rsqadj);
  370. /* RMSE */
  371. fprintf(stdout, "RMSE=%f\n", sqrt(SSerr / count));
  372. /* MAE */
  373. fprintf(stdout, "MAE=%f\n", SAE / count);
  374. /* F statistic */
  375. /* F = ((SStot - SSerr) / (n_predictors)) / (SSerr / (count - n_predictors));
  376. * , or: */
  377. F = ((SStot - SSerr) * (count - n_predictors - 1)) / (SSerr * (n_predictors));
  378. fprintf(stdout, "F=%f\n", F);
  379. i = 0;
  380. /* constant aka estimate for intercept in R */
  381. fprintf(stdout, "b%d=%f\n", i, B[0][i]);
  382. /* t score for R squared of the full model, unused */
  383. t = sqrt(Rsq) * sqrt((count - 2) / (1 - Rsq));
  384. /*
  385. fprintf(stdout, "t%d=%f\n", i, t);
  386. */
  387. /* AIC, corrected AIC, and BIC information criteria for the full model */
  388. AIC = count * log(SSerr / count) + 2 * (n_predictors + 1);
  389. fprintf(stdout, "AIC=%f\n", AIC);
  390. AICc = AIC + (2 * n_predictors * (n_predictors + 1)) / (count - n_predictors - 1);
  391. fprintf(stdout, "AICc=%f\n", AICc);
  392. BIC = count * log(SSerr / count) + log(count) * (n_predictors + 1);
  393. fprintf(stdout, "BIC=%f\n", BIC);
  394. /* error variance of the model, identical to R */
  395. SE = SSerr / (count - n_predictors - 1);
  396. /*
  397. fprintf(stdout, "SE=%f\n", SE);
  398. fprintf(stdout, "SSerr=%f\n", SSerr);
  399. */
  400. for (i = 0; i < n_predictors; i++) {
  401. fprintf(stdout, "\npredictor%d=%s\n", i + 1, input_mapx->answers[i]);
  402. fprintf(stdout, "b%d=%f\n", i + 1, B[0][i + 1]);
  403. if (n_predictors > 1) {
  404. double Rsqi, SEi, sumsqX_corr;
  405. /* corrected sum of squares for predictor [i] */
  406. sumsqX_corr = sumsqX[i] - sumX[i] * sumX[i] / (count - n_predictors - 1);
  407. /* standard error SE for predictor [i] */
  408. /* SE[i] with only one predictor: sqrt(SE / sumsqX_corr)
  409. * this does not work with more than one predictor */
  410. /* in R, SEi is sqrt(diag(R) * resvar) with
  411. * R = ???
  412. * resvar = rss / rdf = SE global
  413. * rss = sum of squares of the residuals
  414. * rdf = residual degrees of freedom = count - n_predictors - 1 */
  415. SEi = sqrt(SE / (Rsq * sumsqX_corr));
  416. /*
  417. fprintf(stdout, "SE%d=%f\n", i + 1, SEi);
  418. */
  419. /* Sum of squares for predictor [i] */
  420. /*
  421. fprintf(stdout, "SSerr%d=%f\n", i + 1, SSerr_without[i] - SSerr);
  422. */
  423. /* R squared of the model without predictor [i] */
  424. /* Rsqi = 1 - SSerr_without[i] / SStot; */
  425. /* the additional amount of variance explained
  426. * when including predictor [i] :
  427. * Rsq - Rsqi */
  428. Rsqi = (SSerr_without[i] - SSerr) / SStot;
  429. fprintf(stdout, "Rsq%d=%f\n", i + 1, Rsqi);
  430. /* t score for Student's t distribution, unused */
  431. t = (B[0][i + 1]) / SEi;
  432. /*
  433. fprintf(stdout, "t%d=%f\n", i + 1, t);
  434. */
  435. /* F score for Fisher's F distribution
  436. * here: F score to test if including predictor [i]
  437. * yields a significant improvement
  438. * after Lothar Sachs, Angewandte Statistik:
  439. * F = (Rsq - Rsqi) * (count - n_predictors - 1) / (1 - Rsq) */
  440. /* same like Sumsq / SE */
  441. /* same like (SSerr_without[i] / SSerr - 1) * (count - n_predictors - 1) */
  442. /* same like R-stats when entered in R-stats as last predictor */
  443. F = (SSerr_without[i] / SSerr - 1) * (count - n_predictors - 1);
  444. fprintf(stdout, "F%d=%f\n", i + 1, F);
  445. /* AIC, corrected AIC, and BIC information criteria for
  446. * the model without predictor [i] */
  447. AIC = count * log(SSerr_without[i] / count) + 2 * (n_predictors);
  448. fprintf(stdout, "AIC%d=%f\n", i + 1, AIC);
  449. AICc = AIC + (2 * (n_predictors - 1) * n_predictors) / (count - n_predictors - 2);
  450. fprintf(stdout, "AICc%d=%f\n", i + 1, AICc);
  451. BIC = count * log(SSerr_without[i] / count) + (n_predictors - 1) * log(count);
  452. fprintf(stdout, "BIC%d=%f\n", i + 1, BIC);
  453. }
  454. }
  455. for (i = 0; i < n_predictors; i++) {
  456. Rast_close(mapx_fd[i]);
  457. G_free(mapx_buf[i]);
  458. }
  459. Rast_close(mapy_fd);
  460. G_free(mapy_buf);
  461. if (mapres_fd > -1) {
  462. struct History history;
  463. Rast_close(mapres_fd);
  464. G_free(mapres_buf);
  465. Rast_short_history(output_res->answer, "raster", &history);
  466. Rast_command_history(&history);
  467. Rast_write_history(output_res->answer, &history);
  468. }
  469. if (mapest_fd > -1) {
  470. struct History history;
  471. Rast_close(mapest_fd);
  472. G_free(mapest_buf);
  473. Rast_short_history(output_est->answer, "raster", &history);
  474. Rast_command_history(&history);
  475. Rast_write_history(output_est->answer, &history);
  476. }
  477. exit(EXIT_SUCCESS);
  478. }