main.c 30 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288
  1. /****************************************************************
  2. *
  3. * MODULE: v.cluster
  4. *
  5. * AUTHOR(S): Markus Metz
  6. *
  7. * PURPOSE: Identifies clusters in a point cloud
  8. *
  9. * COPYRIGHT: (C) 2015 by the GRASS Development Team
  10. *
  11. * This program is free software under the
  12. * GNU General Public License (>=v2).
  13. * Read the file COPYING that comes with GRASS
  14. * for details.
  15. *
  16. ****************************************************************/
  17. #include <stdio.h>
  18. #include <stdlib.h>
  19. #include <string.h>
  20. #include <math.h>
  21. #include <grass/gis.h>
  22. #include <grass/vector.h>
  23. #include <grass/glocale.h>
  24. #include <grass/kdtree.h>
  25. #ifdef MAX
  26. #undef MAX
  27. #endif
  28. #define MAX(a, b) ((a) > (b) ? (a) : (b))
  29. #define CL_DBSCAN 1
  30. #define CL_DBSCAN2 2
  31. #define CL_DENSE 3
  32. #define CL_OPTICS 4
  33. #define CL_OPTICS2 5
  34. #define GET_PARENT(p, c) ((p) = (int) (((c) - 2) / 3 + 1))
  35. #define GET_CHILD(c, p) ((c) = (int) (((p) * 3) - 1))
  36. struct cl_pnt
  37. {
  38. int uid;
  39. int prevpnt;
  40. double cd;
  41. double reach;
  42. double c[3];
  43. };
  44. static struct cl_pnt *clp;
  45. static int *heapidx;
  46. static int heapsize;
  47. int add_pt(int idx);
  48. int drop_pt(void);
  49. int main(int argc, char *argv[])
  50. {
  51. struct Map_info In, Out;
  52. struct line_pnts *Points;
  53. struct line_cats *Cats;
  54. int i, j, type, cat, is3d;
  55. struct GModule *module;
  56. struct Option *input, *output, *lyr_opt;
  57. struct Option *dist_opt, *min_opt, *method_opt;
  58. struct Flag *flag_2d, *flag_topo, *flag_attr;
  59. int clayer;
  60. int npoints, nlines;
  61. int *cid, *idx, *renumber, OLD, NEW;
  62. int nclusters, noutliers;
  63. struct kdtree *kdt;
  64. struct kdtrav trav;
  65. double c[3];
  66. int uid;
  67. double eps;
  68. int ndims, minpnts;
  69. int clmethod;
  70. double *kddist;
  71. int kdfound, *kduid;
  72. /* initialize GIS environment */
  73. /* reads grass env, stores program name to G_program_name() */
  74. G_gisinit(argv[0]);
  75. /* initialize module */
  76. module = G_define_module();
  77. G_add_keyword(_("vector"));
  78. G_add_keyword(_("point cloud"));
  79. G_add_keyword(_("cluster"));
  80. G_add_keyword(_("clump"));
  81. G_add_keyword(_("level1"));
  82. module->description = _("Performs cluster identification.");
  83. /* Define the different options as defined in gis.h */
  84. input = G_define_standard_option(G_OPT_V_INPUT);
  85. output = G_define_standard_option(G_OPT_V_OUTPUT);
  86. lyr_opt = G_define_standard_option(G_OPT_V_FIELD);
  87. lyr_opt->label = _("Layer number or name for cluster ids");
  88. lyr_opt->answer = "2";
  89. dist_opt = G_define_option();
  90. dist_opt->type = TYPE_DOUBLE;
  91. dist_opt->key = "distance";
  92. dist_opt->required = NO;
  93. dist_opt->label = _("Maximum distance to neighbors");
  94. min_opt = G_define_option();
  95. min_opt->type = TYPE_INTEGER;
  96. min_opt->key = "min";
  97. min_opt->required = NO;
  98. min_opt->label = _("Minimum number of points to create a cluster");
  99. method_opt = G_define_option();
  100. method_opt->type = TYPE_STRING;
  101. method_opt->key = "method";
  102. method_opt->options = "dbscan,dbscan2,density,optics,optics2";
  103. method_opt->answer = "dbscan";
  104. method_opt->required = NO;
  105. method_opt->label = _("Clustering method");
  106. flag_2d = G_define_flag();
  107. flag_2d->key = '2';
  108. flag_2d->label = _("Force 2D clustering");
  109. flag_topo = G_define_standard_flag(G_FLG_V_TOPO);
  110. flag_attr = G_define_standard_flag(G_FLG_V_TABLE);
  111. /* options and flags parser */
  112. if (G_parser(argc, argv))
  113. exit(EXIT_FAILURE);
  114. Points = Vect_new_line_struct();
  115. Cats = Vect_new_cats_struct();
  116. Vect_check_input_output_name(input->answer, output->answer, G_FATAL_EXIT);
  117. if (Vect_set_open_level(1))
  118. G_fatal_error(_("Unable to set predetermined vector open level"));
  119. if (1 > Vect_open_old(&In, input->answer, NULL))
  120. G_fatal_error(_("Unable to open vector map <%s>"), input->answer);
  121. /* Check if old vector is 3D. We should preserve 3D data. */
  122. is3d = WITHOUT_Z;
  123. ndims = 2;
  124. if (Vect_is_3d(&In)) {
  125. is3d = WITH_Z;
  126. ndims = 3;
  127. }
  128. if (flag_2d->answer)
  129. ndims = 2;
  130. minpnts = ndims;
  131. if (min_opt->answer) {
  132. minpnts = atoi(min_opt->answer);
  133. if (minpnts < 2) {
  134. G_warning(_("Minimum number of points must be at least 2"));
  135. minpnts = 2;
  136. }
  137. minpnts--;
  138. }
  139. clayer = atoi(lyr_opt->answer);
  140. if (clayer < 1)
  141. G_fatal_error(_("Option %s must be positive"), lyr_opt->key);
  142. clmethod = CL_DBSCAN;
  143. if (!strcmp(method_opt->answer, "dbscan2"))
  144. clmethod = CL_DBSCAN2;
  145. else if (!strcmp(method_opt->answer, "density"))
  146. clmethod = CL_DENSE;
  147. else if (!strcmp(method_opt->answer, "optics"))
  148. clmethod = CL_OPTICS;
  149. else if (!strcmp(method_opt->answer, "optic2"))
  150. clmethod = CL_OPTICS2;
  151. /* count points */
  152. G_message(_("Counting input points ..."));
  153. npoints = nlines = 0;
  154. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  155. nlines++;
  156. if (type == GV_POINT) {
  157. if (Vect_cat_get(Cats, clayer, &cat))
  158. G_fatal_error(_("Layer %d is not empty, choose another layer"),
  159. clayer);
  160. npoints++;
  161. }
  162. }
  163. if (npoints < minpnts + 1) {
  164. G_warning(_("Not enough points in input, nothing to do"));
  165. Vect_close(&In);
  166. exit(EXIT_SUCCESS);
  167. }
  168. /* Open new vector for reading/writing */
  169. if (0 > Vect_open_new(&Out, output->answer, is3d)) {
  170. Vect_close(&In);
  171. G_fatal_error(_("Unable to create vector map <%s>"), output->answer);
  172. }
  173. /* Copy header and history data from old to new map */
  174. Vect_copy_head_data(&In, &Out);
  175. Vect_hist_copy(&In, &Out);
  176. Vect_hist_command(&Out);
  177. /* create k-d tree */
  178. G_message(_("Creating search index ..."));
  179. kdt = kdtree_create(ndims, NULL);
  180. cid = G_malloc((nlines + 1) * sizeof(int));
  181. idx = G_malloc((nlines + 1) * sizeof(int));
  182. Vect_rewind(&In);
  183. i = 0;
  184. cid[0] = 0;
  185. idx[0] = 0;
  186. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  187. G_percent(i++, nlines, 4);
  188. cid[i] = 0;
  189. if (type == GV_POINT) {
  190. c[0] = Points->x[0];
  191. c[1] = Points->y[0];
  192. c[2] = Points->z[0];
  193. kdtree_insert(kdt, c, i, 0);
  194. }
  195. }
  196. G_percent(nlines, nlines, 4);
  197. kdtree_optimize(kdt, 2);
  198. noutliers = nclusters = 0;
  199. if (clmethod == CL_DBSCAN) {
  200. /* DBSCAN
  201. * the neighbors of each point
  202. * with at least minpnts neighbors within distance (epsilon)
  203. * are added to a cluster */
  204. /* get epsilon */
  205. if (dist_opt->answer) {
  206. eps = atof(dist_opt->answer);
  207. if (eps <= 0)
  208. G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
  209. }
  210. else {
  211. int n;
  212. double dist, mean, min, max, sum, sumsq, sd;
  213. double *kd;
  214. int *ki;
  215. /* estimate epsilon */
  216. G_message(_("Estimating maximum distance ..."));
  217. kdtree_init_trav(&trav, kdt);
  218. c[2] = 0.0;
  219. n = 0;
  220. sum = sumsq = 0;
  221. min = 1.0 / 0.0;
  222. max = 0;
  223. kd = G_malloc(minpnts * sizeof(double));
  224. ki = G_malloc(minpnts * sizeof(int));
  225. i = 0;
  226. while (kdtree_traverse(&trav, c, &uid)) {
  227. G_percent(i++, npoints, 4);
  228. kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
  229. if (kdfound) {
  230. dist = sqrt(kd[kdfound - 1]);
  231. sum += dist;
  232. sumsq += dist * dist;
  233. n++;
  234. if (min > dist)
  235. min = dist;
  236. if (max < dist)
  237. max = dist;
  238. }
  239. }
  240. G_percent(npoints, npoints, 4);
  241. G_free(kd);
  242. G_free(ki);
  243. if (!n)
  244. G_fatal_error(_("No neighbors found"));
  245. mean = sum / n;
  246. sd = sqrt(sumsq / n - mean * mean);
  247. eps = mean + 1.644854 * sd; /* 90% CI */
  248. eps = mean + 2.575829 * sd; /* 99% CI */
  249. if (eps > max)
  250. eps = max;
  251. G_message(_("Distance to the %d nearest neighbor:"), minpnts);
  252. G_message(_("Min: %g, max: %g"), min, max);
  253. G_message(_("Mean: %g"), mean);
  254. G_message(_("Standard deviation: %g"), sd);
  255. G_message(_("Estimated maximum distance: %g"), eps);
  256. }
  257. /* create clusters */
  258. G_message(_("Building clusters ..."));
  259. nclusters = 0;
  260. kdtree_init_trav(&trav, kdt);
  261. c[2] = 0.0;
  262. idx[0] = 0;
  263. i = 0;
  264. while (kdtree_traverse(&trav, c, &uid)) {
  265. G_percent(i++, npoints, 4);
  266. /* radius search */
  267. /* TODO: use knn search */
  268. kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
  269. /* must have min neighbors within radius */
  270. if (kdfound >= minpnts) {
  271. OLD = cid[uid];
  272. NEW = idx[OLD];
  273. while (OLD != NEW) {
  274. OLD = NEW;
  275. NEW = idx[OLD];
  276. }
  277. cat = NEW;
  278. /* find latest cluster */
  279. for (j = 0; j < kdfound; j++) {
  280. OLD = cid[kduid[j]];
  281. NEW = idx[OLD];
  282. while (OLD != NEW) {
  283. OLD = NEW;
  284. NEW = idx[OLD];
  285. }
  286. if (cat < NEW) {
  287. cat = NEW;
  288. }
  289. }
  290. if (cat == 0) {
  291. /* start new cluster */
  292. nclusters++;
  293. cat = nclusters;
  294. if (nclusters > nlines)
  295. G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
  296. idx[nclusters] = nclusters;
  297. cid[uid] = nclusters;
  298. }
  299. /* set or update cluster ids */
  300. if (cid[uid] != 0) {
  301. /* relabel */
  302. OLD = cid[uid];
  303. NEW = idx[OLD];
  304. while (OLD != NEW) {
  305. OLD = NEW;
  306. NEW = idx[OLD];
  307. }
  308. idx[NEW] = cat;
  309. }
  310. else {
  311. cid[uid] = cat;
  312. }
  313. for (j = 0; j < kdfound; j++) {
  314. if (cid[kduid[j]] != 0) {
  315. /* relabel */
  316. OLD = cid[kduid[j]];
  317. NEW = idx[OLD];
  318. while (OLD != NEW) {
  319. OLD = NEW;
  320. NEW = idx[OLD];
  321. }
  322. idx[NEW] = cat;
  323. }
  324. else {
  325. cid[kduid[j]] = cat;
  326. }
  327. }
  328. }
  329. if (kdfound) {
  330. G_free(kddist);
  331. G_free(kduid);
  332. }
  333. }
  334. G_percent(npoints, npoints, 4);
  335. if (nclusters == 0) {
  336. G_message(_("No clusters found, adjust option %s"), dist_opt->key);
  337. Vect_close(&In);
  338. Vect_close(&Out);
  339. Vect_delete(output->answer);
  340. exit(EXIT_SUCCESS);
  341. }
  342. /* generate a renumbering scheme */
  343. G_message(_("Generating renumbering scheme..."));
  344. G_debug(1, "%d initial clusters", nclusters);
  345. /* allocate final clump ID */
  346. renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
  347. renumber[0] = 0;
  348. cat = 1;
  349. G_percent(0, nclusters, 1);
  350. for (i = 1; i <= nclusters; i++) {
  351. G_percent(i, nclusters, 4);
  352. OLD = i;
  353. NEW = idx[i];
  354. if (OLD != NEW) {
  355. renumber[i] = 0;
  356. /* find valid clump ID */
  357. while (OLD != NEW) {
  358. OLD = NEW;
  359. NEW = idx[OLD];
  360. }
  361. idx[i] = NEW;
  362. }
  363. else
  364. /* set final clump id */
  365. renumber[i] = cat++;
  366. }
  367. nclusters = cat - 1;
  368. /* write cluster ids */
  369. G_message(_("Write out cluster ids ..."));
  370. Vect_rewind(&In);
  371. i = 0;
  372. noutliers = 0;
  373. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  374. G_percent(i++, nlines, 4);
  375. if (type == GV_POINT) {
  376. cat = renumber[idx[cid[i]]];
  377. if (!cat)
  378. noutliers++;
  379. Vect_cat_set(Cats, clayer, cat);
  380. Vect_write_line(&Out, GV_POINT, Points, Cats);
  381. }
  382. }
  383. G_percent(nlines, nlines, 4);
  384. }
  385. else if (clmethod == CL_DBSCAN2) {
  386. /* DBSCAN, but cluster size must be at least minpnts + 1 */
  387. int *clcnt;
  388. /* get epsilon */
  389. if (dist_opt->answer) {
  390. eps = atof(dist_opt->answer);
  391. if (eps <= 0)
  392. G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
  393. }
  394. else {
  395. int n;
  396. double dist, mean, min, max, sum, sumsq, sd;
  397. double *kd;
  398. int *ki;
  399. /* estimate epsilon */
  400. G_message(_("Estimating maximum distance ..."));
  401. kdtree_init_trav(&trav, kdt);
  402. c[2] = 0.0;
  403. n = 0;
  404. sum = sumsq = 0;
  405. min = 1.0 / 0.0;
  406. max = 0;
  407. kd = G_malloc(minpnts * sizeof(double));
  408. ki = G_malloc(minpnts * sizeof(int));
  409. i = 0;
  410. while (kdtree_traverse(&trav, c, &uid)) {
  411. G_percent(i++, npoints, 4);
  412. kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
  413. if (kdfound) {
  414. dist = sqrt(kd[kdfound - 1]);
  415. sum += dist;
  416. sumsq += dist * dist;
  417. n++;
  418. if (min > dist)
  419. min = dist;
  420. if (max < dist)
  421. max = dist;
  422. }
  423. }
  424. G_percent(npoints, npoints, 4);
  425. G_free(kd);
  426. G_free(ki);
  427. if (!n)
  428. G_fatal_error(_("No neighbors found"));
  429. mean = sum / n;
  430. sd = sqrt(sumsq / n - mean * mean);
  431. eps = mean + 1.644854 * sd; /* 90% CI */
  432. eps = mean + 2.575829 * sd; /* 99% CI */
  433. if (eps > max)
  434. eps = max;
  435. G_message(_("Distance to the %d nearest neighbor:"), minpnts);
  436. G_message(_("Min: %g, max: %g"), min, max);
  437. G_message(_("Mean: %g"), mean);
  438. G_message(_("Standard deviation: %g"), sd);
  439. G_message(_("Estimated maximum distance: %g"), eps);
  440. }
  441. /* create clusters */
  442. G_message(_("Building clusters ..."));
  443. clcnt = G_malloc((nlines + 1) * sizeof(int));
  444. for (i = 0; i <= nlines; i++)
  445. clcnt[i] = 0;
  446. nclusters = 0;
  447. kdtree_init_trav(&trav, kdt);
  448. c[2] = 0.0;
  449. idx[0] = 0;
  450. i = 0;
  451. while (kdtree_traverse(&trav, c, &uid)) {
  452. G_percent(i++, npoints, 4);
  453. /* radius search */
  454. /* TODO: use knn search */
  455. kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
  456. /* any neighbor within radius */
  457. if (kdfound > 0) {
  458. OLD = cid[uid];
  459. NEW = idx[OLD];
  460. while (OLD != NEW) {
  461. OLD = NEW;
  462. NEW = idx[OLD];
  463. }
  464. cat = NEW;
  465. /* find latest cluster */
  466. for (j = 0; j < kdfound; j++) {
  467. OLD = cid[kduid[j]];
  468. NEW = idx[OLD];
  469. while (OLD != NEW) {
  470. OLD = NEW;
  471. NEW = idx[OLD];
  472. }
  473. if (cat < NEW) {
  474. cat = NEW;
  475. }
  476. }
  477. if (cat == 0) {
  478. /* start new cluster */
  479. nclusters++;
  480. cat = nclusters;
  481. if (nclusters > nlines)
  482. G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
  483. idx[nclusters] = nclusters;
  484. cid[uid] = nclusters;
  485. clcnt[cat] = 1;
  486. }
  487. /* set or update cluster ids */
  488. if (cid[uid] != 0) {
  489. /* relabel */
  490. OLD = cid[uid];
  491. NEW = idx[OLD];
  492. while (OLD != NEW) {
  493. OLD = NEW;
  494. NEW = idx[OLD];
  495. }
  496. idx[NEW] = cat;
  497. }
  498. else {
  499. cid[uid] = cat;
  500. clcnt[cat]++;
  501. }
  502. for (j = 0; j < kdfound; j++) {
  503. if (cid[kduid[j]] != 0) {
  504. OLD = cid[kduid[j]];
  505. NEW = idx[OLD];
  506. while (OLD != NEW) {
  507. OLD = NEW;
  508. NEW = idx[OLD];
  509. }
  510. /* relabel */
  511. idx[NEW] = cat;
  512. }
  513. else {
  514. cid[kduid[j]] = cat;
  515. clcnt[cat]++;
  516. }
  517. }
  518. G_free(kddist);
  519. G_free(kduid);
  520. }
  521. }
  522. G_percent(npoints, npoints, 4);
  523. if (nclusters == 0) {
  524. G_message(_("No clusters found, adjust option %s"), dist_opt->key);
  525. Vect_close(&In);
  526. Vect_close(&Out);
  527. Vect_delete(output->answer);
  528. exit(EXIT_SUCCESS);
  529. }
  530. /* generate a renumbering scheme */
  531. G_message(_("Generating renumbering scheme..."));
  532. G_debug(1, "%d initial clusters", nclusters);
  533. /* allocate final clump ID */
  534. renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
  535. renumber[0] = 0;
  536. cat = 1;
  537. G_percent(0, nclusters, 1);
  538. for (i = 1; i <= nclusters; i++) {
  539. G_percent(i, nclusters, 4);
  540. OLD = i;
  541. NEW = idx[i];
  542. if (OLD != NEW) {
  543. /* find valid clump ID */
  544. while (OLD != NEW) {
  545. OLD = NEW;
  546. NEW = idx[OLD];
  547. }
  548. idx[i] = NEW;
  549. clcnt[NEW] += clcnt[i];
  550. }
  551. }
  552. for (i = 1; i <= nclusters; i++) {
  553. OLD = i;
  554. NEW = idx[i];
  555. renumber[i] = 0;
  556. if (OLD == NEW && clcnt[NEW] > minpnts) {
  557. /* set final clump id */
  558. renumber[i] = cat++;
  559. }
  560. }
  561. nclusters = cat - 1;
  562. /* write cluster ids */
  563. G_message(_("Write out cluster ids ..."));
  564. Vect_rewind(&In);
  565. i = 0;
  566. noutliers = 0;
  567. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  568. G_percent(i++, nlines, 4);
  569. if (type == GV_POINT) {
  570. cat = renumber[idx[cid[i]]];
  571. if (!cat)
  572. noutliers++;
  573. Vect_cat_set(Cats, clayer, cat);
  574. Vect_write_line(&Out, GV_POINT, Points, Cats);
  575. }
  576. }
  577. G_percent(nlines, nlines, 4);
  578. }
  579. else if (clmethod == CL_OPTICS) {
  580. /* OPTICS
  581. * each pair of points is either directly connected or
  582. * connected by a chain of other points
  583. * for each unprocessed point p
  584. * mark as processed, append to output list
  585. * core distance of p: distance to the k-th neighbor
  586. * q: neighbor of p
  587. * reachability of q: max(dist(p, q), coredist(p))
  588. * -> needs epsilon, otherwise always coredist(p)
  589. * for each unprocessed neighbor q
  590. * if q has not been reached yet, put q in min heap
  591. * if q's reachability can be reduced, put q with new reachability in min heap
  592. * proceed with point with smallest reachability
  593. * clusters:
  594. * plot x = position in output list, y = reachability
  595. * clusters = valleys of reachability in plot
  596. * hierarchical clusters: valleys in valleys
  597. */
  598. double *kd;
  599. int *ki;
  600. int k, kdpnts;
  601. int *clidx;
  602. int *olist, nout;
  603. double newrd;
  604. int isout;
  605. kd = G_malloc(minpnts * sizeof(double));
  606. ki = G_malloc(minpnts * sizeof(int));
  607. clp = G_malloc((npoints + 1) * sizeof(struct cl_pnt));
  608. heapidx = G_malloc((npoints + 1) * sizeof(int));
  609. olist = G_malloc((npoints + 1) * sizeof(int));
  610. clidx = G_malloc((nlines + 1) * sizeof(int));
  611. heapsize = 0;
  612. /* get epsilon */
  613. eps = 0;
  614. if (dist_opt->answer) {
  615. eps = atof(dist_opt->answer);
  616. if (eps <= 0)
  617. G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
  618. }
  619. /* loading points */
  620. G_message(_("Loading points ..."));
  621. kdtree_init_trav(&trav, kdt);
  622. c[2] = 0.0;
  623. i = 0;
  624. while (kdtree_traverse(&trav, c, &uid)) {
  625. G_percent(i, npoints, 4);
  626. clp[i].c[0] = c[0];
  627. clp[i].c[1] = c[1];
  628. clp[i].c[2] = c[2];
  629. clp[i].uid = uid;
  630. clp[i].cd = -1;
  631. clp[i].reach = -1;
  632. clp[i].prevpnt = -1;
  633. clidx[uid] = i;
  634. olist[i] = -1;
  635. i++;
  636. }
  637. G_percent(npoints, npoints, 4);
  638. kdpnts = i;
  639. G_debug(0, "%d points in k-d tree", kdpnts);
  640. /* reachability network */
  641. G_message(_("Reachability network ..."));
  642. nout = 0;
  643. for (i = 0; i < kdpnts; i++) {
  644. G_percent(i, kdpnts, 4);
  645. if (clp[i].cd > 0)
  646. continue;
  647. /* knn search */
  648. uid = clp[i].uid;
  649. kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &uid);
  650. if (kdfound < minpnts)
  651. G_fatal_error(_("Not enough points found"));
  652. clp[i].cd = kd[minpnts - 1];
  653. /* no reachability for the seed point !!! */
  654. clp[i].reach = clp[i].cd; /* ok ? */
  655. olist[nout++] = i;
  656. /* initialize heap */
  657. newrd = clp[i].cd;
  658. for (j = 0; j < kdfound; j++) {
  659. if (clp[clidx[ki[j]]].cd < 0) {
  660. /* deviation from OPTICS,
  661. * creates nicer connectivity graph */
  662. newrd = kd[j];
  663. if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
  664. clp[clidx[ki[j]]].reach = newrd;
  665. clp[clidx[ki[j]]].prevpnt = i;
  666. add_pt(clidx[ki[j]]);
  667. }
  668. }
  669. }
  670. while (heapsize) {
  671. k = drop_pt();
  672. if (k < 0 || k >= kdpnts)
  673. G_fatal_error("Invalid index");
  674. if (clp[k].cd > 0)
  675. continue;
  676. /* knn search */
  677. uid = clp[k].uid;
  678. kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &uid);
  679. if (kdfound < minpnts)
  680. G_fatal_error(_("Not enough points found"));
  681. clp[k].cd = kd[minpnts - 1];
  682. olist[nout++] = k;
  683. newrd = clp[k].cd;
  684. for (j = 0; j < kdfound; j++) {
  685. if (heapsize >= npoints)
  686. G_fatal_error("Heap is too large");
  687. if (clp[clidx[ki[j]]].cd < 0) {
  688. /* deviation from OPTICS,
  689. * creates nicer connectivity graph */
  690. newrd = kd[j];
  691. if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
  692. clp[clidx[ki[j]]].reach = newrd;
  693. clp[clidx[ki[j]]].prevpnt = k;
  694. add_pt(clidx[ki[j]]);
  695. }
  696. }
  697. }
  698. }
  699. }
  700. G_percent(kdpnts, kdpnts, 4);
  701. G_debug(0, "nout: %d", nout);
  702. if (nout != kdpnts)
  703. G_fatal_error("nout != kdpnts");
  704. /* set cluster ids */
  705. G_message(_("Set cluster ids ..."));
  706. isout = 1;
  707. nclusters = 0;
  708. for (i = 0; i < kdpnts; i++) {
  709. G_percent(i, kdpnts, 4);
  710. if (eps > 0 && clp[olist[i]].reach > eps)
  711. isout = 1;
  712. else {
  713. if (isout || clp[olist[i]].prevpnt == -1) {
  714. isout = 0;
  715. nclusters++;
  716. }
  717. cid[clp[olist[i]].uid] = nclusters;
  718. }
  719. }
  720. G_percent(kdpnts, kdpnts, 4);
  721. /* write cluster ids */
  722. G_message(_("Write out cluster ids ..."));
  723. Vect_rewind(&In);
  724. i = 0;
  725. noutliers = 0;
  726. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  727. G_percent(i++, nlines, 4);
  728. if (type == GV_POINT) {
  729. cat = cid[i];
  730. if (!cat)
  731. noutliers++;
  732. Vect_cat_set(Cats, clayer, cat);
  733. Vect_write_line(&Out, GV_POINT, Points, Cats);
  734. }
  735. }
  736. G_percent(nlines, nlines, 4);
  737. }
  738. else if (clmethod == CL_OPTICS2) {
  739. /* OPTICS modified, create separated reachability networks
  740. * for each point p
  741. * get p's core distance
  742. * get p's neighbors
  743. * for each neighbor q
  744. * new reachability of q: dist(p, q)
  745. * if q has been processed
  746. * new reachability of q: max(coredist(q), dist(p, q))
  747. * set or reduce q's reachability
  748. * connect q to p if q's reachability can be updated
  749. */
  750. double *coredist;
  751. double *reachability;
  752. int *nextpnt;
  753. double *kd;
  754. int *ki;
  755. double newrd;
  756. coredist = G_malloc((nlines + 1) * sizeof(double));
  757. reachability = G_malloc((nlines + 1) * sizeof(double));
  758. nextpnt = G_malloc((nlines + 1) * sizeof(int));
  759. kd = G_malloc(minpnts * sizeof(double));
  760. ki = G_malloc(minpnts * sizeof(int));
  761. for (i = 0; i <= nlines; i++) {
  762. coredist[i] = -1;
  763. reachability[i] = -1;
  764. nextpnt[i] = -1;
  765. cid[i] = 0;
  766. idx[i] = 0;
  767. }
  768. /* reachability network */
  769. G_message(_("Reachability network ..."));
  770. kdtree_init_trav(&trav, kdt);
  771. c[2] = 0.0;
  772. i = 0;
  773. while (kdtree_traverse(&trav, c, &uid)) {
  774. G_percent(i++, npoints, 4);
  775. /* knn search */
  776. kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
  777. if (kdfound < minpnts)
  778. G_fatal_error(_("Not enough points found"));
  779. coredist[uid] = kd[minpnts - 1];
  780. for (j = 0; j < kdfound; j++) {
  781. /* new reachability */
  782. newrd = kd[j];
  783. if (coredist[ki[j]] > kd[j]) {
  784. /* do not connect a point to its own cluster
  785. * because points in its own cluster
  786. * have already been connected to this point
  787. * or reconnected */
  788. newrd = coredist[ki[j]];
  789. }
  790. if (reachability[ki[j]] == -1 || reachability[ki[j]] > newrd) {
  791. reachability[ki[j]] = newrd;
  792. nextpnt[ki[j]] = uid;
  793. /* no link - back link */
  794. if (nextpnt[uid] == ki[j]) {
  795. if (coredist[ki[j]] == -1) {
  796. G_fatal_error(_("Neighbor point's core dist is -1"));
  797. }
  798. if (coredist[ki[j]] < coredist[uid]) {
  799. nextpnt[ki[j]] = -1;
  800. reachability[ki[j]] = -1;
  801. nextpnt[uid] = ki[j];
  802. }
  803. else {
  804. nextpnt[uid] = -1;
  805. reachability[uid] = -1;
  806. reachability[uid] = -1;
  807. }
  808. }
  809. }
  810. }
  811. }
  812. G_percent(npoints, npoints, 4);
  813. /* create clusters from reachability network */
  814. G_message(_("Building clusters ..."));
  815. G_percent(0, nlines, 4);
  816. for (i = 1; i <= nlines; i++) {
  817. G_percent(i, nlines, 4);
  818. if (cid[i] > 0 || coredist[i] == -1 || nextpnt[i] == -1)
  819. continue;
  820. if (cid[nextpnt[i]] > 0) {
  821. cid[i] = idx[cid[nextpnt[i]]];
  822. }
  823. else {
  824. /* start new cluster */
  825. nclusters++;
  826. cid[i] = nclusters;
  827. idx[nclusters] = nclusters;
  828. uid = nextpnt[i];
  829. while (uid > 0) {
  830. if (cid[uid] == 0) {
  831. cid[uid] = nclusters;
  832. uid = nextpnt[uid];
  833. }
  834. else {
  835. /* relabel */
  836. OLD = cid[uid];
  837. NEW = idx[OLD];
  838. while (OLD != NEW) {
  839. OLD = NEW;
  840. NEW = idx[OLD];
  841. }
  842. idx[NEW] = nclusters;
  843. uid = nextpnt[uid];
  844. uid = -1;
  845. }
  846. }
  847. }
  848. }
  849. /* generate a renumbering scheme */
  850. G_message(_("Generating renumbering scheme..."));
  851. G_debug(1, "%d initial clusters", nclusters);
  852. /* allocate final clump ID */
  853. renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
  854. renumber[0] = 0;
  855. cat = 1;
  856. G_percent(0, nclusters, 1);
  857. for (i = 1; i <= nclusters; i++) {
  858. G_percent(i, nclusters, 4);
  859. OLD = i;
  860. NEW = idx[i];
  861. if (OLD != NEW) {
  862. renumber[i] = 0;
  863. /* find valid clump ID */
  864. while (OLD != NEW) {
  865. OLD = NEW;
  866. NEW = idx[OLD];
  867. }
  868. idx[i] = NEW;
  869. }
  870. else
  871. /* set final clump id */
  872. renumber[i] = cat++;
  873. }
  874. nclusters = cat - 1;
  875. /* write cluster ids */
  876. G_message(_("Write out cluster ids ..."));
  877. Vect_rewind(&In);
  878. i = 0;
  879. noutliers = 0;
  880. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  881. G_percent(i++, nlines, 4);
  882. if (type == GV_POINT) {
  883. cat = renumber[idx[cid[i]]];
  884. if (!cat)
  885. noutliers++;
  886. Vect_cat_set(Cats, clayer, cat);
  887. Vect_write_line(&Out, GV_POINT, Points, Cats);
  888. }
  889. }
  890. G_percent(nlines, nlines, 4);
  891. }
  892. else if (clmethod == CL_DENSE) {
  893. /* MATRUSKA
  894. * clusters in clusters in clusters ...
  895. * calculate core density = distance to (minpnts - 1) for each point
  896. * sort points ascending by core density
  897. * for each point in sorted list
  898. * if point does not have a cluster id
  899. * start new cluster, cluster reachability is core density of this point
  900. * connect all points within cluster reachability
  901. * add all connected points to list
  902. * while list is not empty
  903. * remove last point from list
  904. * connect all points within cluster reachability
  905. * add all connected points to list
  906. * */
  907. int *clidx;
  908. double *kd;
  909. int *ki;
  910. double cd;
  911. int k, kdcount;
  912. struct ilist *CList;
  913. clp = G_malloc((nlines + 1) * sizeof(struct cl_pnt));
  914. clidx = G_malloc((nlines + 1) * sizeof(int));
  915. kd = G_malloc(minpnts * sizeof(double));
  916. ki = G_malloc(minpnts * sizeof(int));
  917. CList = G_new_ilist();
  918. for (i = 0; i <= nlines; i++) {
  919. clp[i].c[0] = 0;
  920. clp[i].c[1] = 0;
  921. clp[i].c[2] = 0;
  922. clp[i].cd = -1;
  923. clp[i].uid = -1;
  924. clidx[i] = -1;
  925. }
  926. /* core density */
  927. G_message(_("Core density ..."));
  928. kdtree_init_trav(&trav, kdt);
  929. c[2] = 0.0;
  930. i = 0;
  931. kdcount = 0;
  932. uid = -1;
  933. while (kdtree_traverse(&trav, c, &uid)) {
  934. G_percent(i++, npoints, 4);
  935. /* knn search */
  936. kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
  937. if (kdfound < minpnts)
  938. G_fatal_error(_("Not enough points found"));
  939. cd = kd[minpnts - 1];
  940. /* list insert */
  941. for (j = kdcount; j > 0; j--) {
  942. if (clp[j - 1].cd <= cd)
  943. break;
  944. clp[j] = clp[j - 1];
  945. clidx[clp[j].uid] = j;
  946. }
  947. clp[j].uid = uid;
  948. clp[j].c[0] = c[0];
  949. clp[j].c[1] = c[1];
  950. clp[j].c[2] = c[2];
  951. clp[j].cd = cd;
  952. clidx[clp[j].uid] = j;
  953. kdcount++;
  954. }
  955. G_percent(npoints, npoints, 4);
  956. /* create clusters */
  957. G_message(_("Building clusters ..."));
  958. nclusters = 0;
  959. for (i = 0; i < kdcount; i++) {
  960. G_percent(i, kdcount, 4);
  961. if (cid[clp[i].uid] > 0)
  962. continue;
  963. /* knn search */
  964. kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &clp[i].uid);
  965. if (kdfound < minpnts)
  966. G_fatal_error(_("Not enough points found"));
  967. /* start a new cluster */
  968. uid = clp[i].uid;
  969. nclusters++;
  970. cat = nclusters;
  971. cid[uid] = cat;
  972. cd = clp[i].cd;
  973. CList->n_values = 0;
  974. for (j = 0; j < kdfound; j++) {
  975. if (cid[ki[j]] == 0) {
  976. G_ilist_add(CList, clidx[ki[j]]);
  977. cid[ki[j]] = cat;
  978. }
  979. }
  980. if (CList->n_values < minpnts) {
  981. CList->n_values = 0;
  982. nclusters--;
  983. cid[uid] = 0;
  984. for (j = 0; j < kdfound; j++) {
  985. if (cid[ki[j]] == cat) {
  986. cid[ki[j]] = 0;
  987. }
  988. }
  989. }
  990. while (CList->n_values) {
  991. /* expand cluster */
  992. CList->n_values--;
  993. k = CList->value[CList->n_values];
  994. if (k < 0)
  995. G_fatal_error("expand cluster: k < 0");
  996. if (clp[k].uid < 1)
  997. G_fatal_error("expand cluster: clp[k].uid < 1");
  998. kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &clp[k].uid);
  999. if (kdfound < minpnts)
  1000. G_fatal_error(_("Not enough points found"));
  1001. for (j = 0; j < kdfound; j++) {
  1002. if (kd[j] <= cd && cid[ki[j]] == 0) {
  1003. cid[ki[j]] = cat;
  1004. if (clidx[ki[j]] < 0)
  1005. G_fatal_error("expand cluster ngbrs: clidx[ki[j]] < 0");
  1006. G_ilist_add(CList, clidx[ki[j]]);
  1007. }
  1008. }
  1009. }
  1010. }
  1011. G_percent(kdcount, kdcount, 4);
  1012. /* write cluster ids */
  1013. G_message(_("Write out cluster ids ..."));
  1014. Vect_rewind(&In);
  1015. i = 0;
  1016. noutliers = 0;
  1017. while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
  1018. G_percent(i++, nlines, 4);
  1019. if (type == GV_POINT) {
  1020. cat = cid[i];
  1021. if (!cat)
  1022. noutliers++;
  1023. Vect_cat_set(Cats, clayer, cat);
  1024. Vect_write_line(&Out, GV_POINT, Points, Cats);
  1025. }
  1026. }
  1027. G_percent(nlines, nlines, 4);
  1028. }
  1029. if (!flag_attr->answer)
  1030. Vect_copy_tables(&In, &Out, 0);
  1031. /* Build topology for vector map and close them */
  1032. Vect_close(&In);
  1033. if (!flag_topo->answer)
  1034. Vect_build(&Out);
  1035. Vect_close(&Out);
  1036. G_message(_("%d clusters found"), nclusters);
  1037. G_message(_("%d outliers found"), noutliers);
  1038. exit(EXIT_SUCCESS);
  1039. }
  1040. /* min heap */
  1041. /* compare heap points */
  1042. int cmp_pnt(int a, int b)
  1043. {
  1044. if (clp[a].reach < clp[b].reach)
  1045. return 1;
  1046. if (clp[a].reach > clp[b].reach)
  1047. return 0;
  1048. if (clp[a].uid < clp[b].uid)
  1049. return 1;
  1050. return 0;
  1051. }
  1052. /* standard sift-up routine for d-ary min heap */
  1053. static int sift_up(int start)
  1054. {
  1055. register int parent, child, child_added;
  1056. child = start;
  1057. child_added = heapidx[child];
  1058. while (child > 1) {
  1059. GET_PARENT(parent, child);
  1060. /* child smaller */
  1061. if (cmp_pnt(child_added, heapidx[parent])) {
  1062. /* push parent point down */
  1063. heapidx[child] = heapidx[parent];
  1064. child = parent;
  1065. }
  1066. else
  1067. /* no more sifting up, found new slot for child */
  1068. break;
  1069. }
  1070. /* put point in new slot */
  1071. if (child < start) {
  1072. heapidx[child] = child_added;
  1073. }
  1074. return 0;
  1075. }
  1076. /* add point routine for min heap */
  1077. int add_pt(int idx)
  1078. {
  1079. /* add point to next free position */
  1080. heapsize++;
  1081. heapidx[heapsize] = idx;
  1082. /* sift up: move new point towards top of heap */
  1083. sift_up(heapsize);
  1084. return 0;
  1085. }
  1086. /* drop point routine for min heap */
  1087. int drop_pt(void)
  1088. {
  1089. register int child, childr, parent;
  1090. register int i, idx;
  1091. idx = heapidx[1];
  1092. if (heapsize == 1) {
  1093. heapidx[1] = -1;
  1094. heapsize = 0;
  1095. return idx;
  1096. }
  1097. /* start with root */
  1098. parent = 1;
  1099. /* sift down: move hole back towards bottom of heap */
  1100. while (GET_CHILD(child, parent) <= heapsize) {
  1101. i = child + 3;
  1102. for (childr = child + 1; childr <= heapsize && childr < i; childr++) {
  1103. if (cmp_pnt(heapidx[childr], heapidx[child])) {
  1104. child = childr;
  1105. }
  1106. }
  1107. /* move hole down */
  1108. heapidx[parent] = heapidx[child];
  1109. parent = child;
  1110. }
  1111. /* hole is in lowest layer, move to heap end */
  1112. if (parent < heapsize) {
  1113. heapidx[parent] = heapidx[heapsize];
  1114. /* sift up last swapped point, only necessary if hole moved to heap end */
  1115. sift_up(parent);
  1116. }
  1117. /* the actual drop */
  1118. heapsize--;
  1119. return idx;
  1120. }