Benford.ecl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. /**
  2. * Benford's law, also called the Newcomb–Benford law, or the law of anomalous
  3. * numbers, is an observation about the frequency distribution of leading
  4. * digits in many real-life sets of numerical data.
  5. *
  6. * Benford's law doesn't apply to every set of numbers, but it usually applies
  7. * to large sets of naturally occurring numbers with some connection like:
  8. *
  9. * Companies' stock market values
  10. * Data found in texts — like the Reader's Digest, or a copy of Newsweek
  11. * Demographic data, including state and city populations
  12. * Income tax data
  13. * Mathematical tables, like logarithms
  14. * River drainage rates
  15. * Scientific data
  16. *
  17. * The law usually doesn't apply to data sets that have a stated minimum and
  18. * maximum, like interest rates or hourly wages. If numbers are assigned,
  19. * rather than naturally occurring, they will also not follow the law. Examples
  20. * of assigned numbers include: zip codes, telephone numbers and Social
  21. * Security numbers.
  22. *
  23. * For more information: https://en.wikipedia.org/wiki/Benford%27s_law
  24. *
  25. * This function computes the distribution of digits within one or more
  26. * attributes in a dataset and displays the result, one attribute per row,
  27. * with an "expected" row showing the expected distributions. Included
  28. * in each data row is a chi-squared computation for that row indicating how
  29. * well the computed result matches the expected result (if the chi-squared
  30. * value exceeds the one shown in the --EXPECTED-- row then the data row
  31. * DOES NOT follow Benford's Law).
  32. *
  33. * Note that when computing the distribution of the most significant digit,
  34. * the digit zero is ignored. So for instance, the values 0100, 100, 1.0,
  35. * 0.10, and 0.00001 all have a most-significant digit of '1'. The digit
  36. * zero is considered for all other positions.
  37. *
  38. * @param inFile The dataset to process; REQUIRED
  39. * @param fieldListStr A string containing a comma-delimited list of
  40. * attribute names to process; note that attributes
  41. * listed here must be top-level attributes (not child
  42. * records or child datasets); use an empty string to
  43. * process all top-level attributes in inFile;
  44. * OPTIONAL, defaults to an empty string
  45. * @param digit The 1-based digit within the number to examine; the
  46. * first significant digit is '1' and it only increases;
  47. * OPTIONAL, defaults to 1, meaning the most-significant
  48. * non-zero digit
  49. * @param sampleSize A positive integer representing a percentage of
  50. * inFile to examine, which is useful when analyzing a
  51. * very large dataset and only an estimated data
  52. * analysis is sufficient; valid range for this
  53. * argument is 1-100; values outside of this range
  54. * will be clamped; OPTIONAL, defaults to 100 (which
  55. * indicates that all rows in the dataset will be used)
  56. *
  57. * @return A new dataset with the following record structure:
  58. *
  59. * RECORD
  60. * STRING attribute; // Name of data attribute examined
  61. * DECIMAL4_1 zero; // Percentage of rows with digit of '0'
  62. * DECIMAL4_1 one; // Percentage of rows with digit of '1'
  63. * DECIMAL4_1 two; // Percentage of rows with digit of '2'
  64. * DECIMAL4_1 three; // Percentage of rows with digit of '3'
  65. * DECIMAL4_1 four; // Percentage of rows with digit of '4'
  66. * DECIMAL4_1 five; // Percentage of rows with digit of '5'
  67. * DECIMAL4_1 six; // Percentage of rows with digit of '6'
  68. * DECIMAL4_1 seven; // Percentage of rows with digit of '7'
  69. * DECIMAL4_1 eight; // Percentage of rows with digit of '8'
  70. * DECIMAL4_1 nine; // Percentage of rows with digit of '9'
  71. * DECIMAL7_3 chi_squared; // Chi-squared "fitness test" result
  72. * UNSIGNED8 num_values; // Number of rows with non-zero values for this attribute
  73. * END;
  74. *
  75. * The named digit fields (e.g. "zero" and "one" and so on) represent the
  76. * digit found in the 'digit' position of the associated attribute. The values
  77. * that appear there are percentages. num_values shows the number of
  78. * non-zero values processed, and chi_squared shows the result of applying
  79. * that test using the observed vs expected distribution values.
  80. *
  81. * The first row of the results will show the expected values for the named
  82. * digits, with "-- EXPECTED DIGIT n --" showing as the attribute name.'n' will
  83. * be replaced with the value of 'digit' which indicates which digit position
  84. * was examined.
  85. *
  86. * Note that when viewing the results for the mosts significant digit (digit = 1),
  87. * the 'zero' field will show a -1 value, indicating that it was ignored.
  88. */
  89. EXPORT Benford(inFile, fieldListStr = '\'\'', digit = 1, sampleSize = 100) := FUNCTIONMACRO
  90. #UNIQUENAME(minDigit);
  91. LOCAL %minDigit% := MAX((INTEGER)digit, 1);
  92. #UNIQUENAME(clampedDigit);
  93. LOCAL %clampedDigit% := MIN(%minDigit%, 4);
  94. // Chi-squared critical value table:
  95. // https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm
  96. // Chi-squared critical values for 8 degrees of freedom at various probabilities
  97. // Probability: 0.90 0.95 0.975 0.99 0.999
  98. // Critical value: 13.362 15.507 17.535 20.090 26.125
  99. #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE_1);
  100. #SET(CHI_SQUARED_CRITICAL_VALUE_1, 20.090); // 99% probability
  101. // Chi-squared critical values for 9 degrees of freedom at various probabilities
  102. // Probability: 0.90 0.95 0.975 0.99 0.999
  103. // Critical value: 14.684 16.919 19.023 21.666 27.877
  104. #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE_2);
  105. #SET(CHI_SQUARED_CRITICAL_VALUE_2, 21.666); // 99% probability
  106. #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE);
  107. LOCAL %CHI_SQUARED_CRITICAL_VALUE% := IF(%clampedDigit% = 1, %CHI_SQUARED_CRITICAL_VALUE_1%, %CHI_SQUARED_CRITICAL_VALUE_2%);
  108. #UNIQUENAME(expectedDistribution);
  109. LOCAL %expectedDistribution% := DATASET
  110. (
  111. [
  112. {1, -1, 30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6},
  113. {2, 12.0, 11.4, 10.9, 10.4, 10.0, 9.7, 9.3, 9.0, 8.8, 8.5},
  114. {3, 10.2, 10.1, 10.1, 10.1, 10.0, 10.0, 9.9, 9.9, 9.9, 9.8},
  115. {4, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0}
  116. ],
  117. {
  118. UNSIGNED1 pos,
  119. DECIMAL4_1 zero,
  120. DECIMAL4_1 one,
  121. DECIMAL4_1 two,
  122. DECIMAL4_1 three,
  123. DECIMAL4_1 four,
  124. DECIMAL4_1 five,
  125. DECIMAL4_1 six,
  126. DECIMAL4_1 seven,
  127. DECIMAL4_1 eight,
  128. DECIMAL4_1 nine
  129. }
  130. );
  131. // Remove all spaces from field list so we can parse it more easily
  132. #UNIQUENAME(trimmedFieldList);
  133. LOCAL %trimmedFieldList% := TRIM((STRING)fieldListStr, ALL);
  134. // Ungroup the given dataset, in case it was grouped
  135. #UNIQUENAME(ungroupedInFile);
  136. LOCAL %ungroupedInFile% := UNGROUP(inFile);
  137. // Clamp the sample size to something reasonable
  138. #UNIQUENAME(clampedSampleSize);
  139. LOCAL %clampedSampleSize% := MAX(1, MIN(100, (INTEGER)sampleSize));
  140. // Create a sample dataset if needed
  141. #UNIQUENAME(sampledData);
  142. LOCAL %sampledData% := IF
  143. (
  144. %clampedSampleSize% < 100,
  145. ENTH(%ungroupedInFile%, %clampedSampleSize%, 100, 1, LOCAL),
  146. %ungroupedInFile%
  147. );
  148. // Slim the dataset if the caller provided an explicit set of attributes;
  149. // note that the TABLE function will fail if %trimmedFieldList% cites an
  150. // attribute that is a child dataset (this is an ECL limitation)
  151. #UNIQUENAME(workingInFile);
  152. LOCAL %workingInFile% :=
  153. #IF(%trimmedFieldList% = '')
  154. %sampledData%
  155. #ELSE
  156. TABLE(%sampledData%, {#EXPAND(%trimmedFieldList%)})
  157. #END;
  158. // Helper function that returns the 'pos' significant digit in a string;
  159. // if pos = 1 then th digit must be non-zero; returns 10
  160. // (an invalid *digit*) if no suitable digit is found
  161. #UNIQUENAME(NthDigit);
  162. LOCAL UNSIGNED1 %NthDigit%(STRING s, UNSIGNED1 pos) := EMBED(C++)
  163. #option pure
  164. unsigned char foundDigit = 10; // impossible value
  165. int digitsFound = 0;
  166. for (unsigned int x = 0; x < lenS; x++)
  167. {
  168. char ch = s[x];
  169. if (isdigit(ch) && (digitsFound > 0 || ch != '0'))
  170. {
  171. ++digitsFound;
  172. if (digitsFound >= pos)
  173. {
  174. foundDigit = ch - '0';
  175. break;
  176. }
  177. // Once we find a significant digit, the default return value
  178. // is a trailing zero (assumed after an implied decimal point
  179. // if we're parsing an integer)
  180. foundDigit = 0;
  181. }
  182. else if (ch == '.')
  183. {
  184. // Once we find a decimal point, the default return value
  185. // is a trailing zero
  186. foundDigit = 0;
  187. }
  188. }
  189. return foundDigit;
  190. ENDEMBED;
  191. // Temp field name we will use to ensure proper ordering of results
  192. #UNIQUENAME(idField);
  193. // One-record dataset containing expected Benford results, per-digit
  194. #UNIQUENAME(expectedDS);
  195. LOCAL %expectedDS% := PROJECT
  196. (
  197. %expectedDistribution%(pos = %clampedDigit%),
  198. TRANSFORM
  199. (
  200. {
  201. UNSIGNED2 %idField%,
  202. RECORDOF(LEFT) - [pos],
  203. DECIMAL7_3 chi_squared,
  204. UNSIGNED8 num_values,
  205. STRING attribute // Put this at the end for now, because it is variable-length
  206. },
  207. SELF.%idField% := 0,
  208. SELF.chi_squared := 0,
  209. SELF.num_values := COUNT(%workingInFile%),
  210. SELF.attribute := '-- EXPECTED DIGIT ' + (STRING)%minDigit% + ' --',
  211. SELF := LEFT
  212. )
  213. );
  214. // This will be used later as a datatype in a function signature
  215. #UNIQUENAME(DataRec);
  216. LOCAL %DataRec% := RECORDOF(%expectedDS%);
  217. // Get the internal representation of our working dataset
  218. #EXPORTXML(inFileFields, RECORDOF(%workingInFile%));
  219. // Create a dataset composed of the expectedDS and a row for each
  220. // field we will be processing
  221. #UNIQUENAME(interimResult);
  222. LOCAL %interimResult% := %expectedDS%
  223. #UNIQUENAME(recLevel)
  224. #SET(recLevel, 0)
  225. #UNIQUENAME(fieldNum)
  226. #SET(fieldNum, 0)
  227. #FOR(inFileFields)
  228. #FOR(Field)
  229. #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
  230. #SET(recLevel, %recLevel% + 1)
  231. #ELSEIF(%{@isEnd}% = 1)
  232. #SET(recLevel, %recLevel% - 1)
  233. #ELSEIF(%recLevel% = 0)
  234. #SET(fieldNum, %fieldNum% + 1)
  235. + TABLE
  236. (
  237. PROJECT(%workingInFile%, TRANSFORM({UNSIGNED1 n}, SELF.n := %NthDigit%((STRING)LEFT.%@name%, %minDigit%)))(n != 10),
  238. {
  239. UNSIGNED2 %idField% := %fieldNum%,
  240. DECIMAL4_1 zero := IF(%minDigit% = 1, -1.0, COUNT(GROUP, n = 0) / COUNT(GROUP) * 100),
  241. DECIMAL4_1 one := COUNT(GROUP, n = 1) / COUNT(GROUP) * 100,
  242. DECIMAL4_1 two := COUNT(GROUP, n = 2) / COUNT(GROUP) * 100,
  243. DECIMAL4_1 three := COUNT(GROUP, n = 3) / COUNT(GROUP) * 100,
  244. DECIMAL4_1 four := COUNT(GROUP, n = 4) / COUNT(GROUP) * 100,
  245. DECIMAL4_1 five := COUNT(GROUP, n = 5) / COUNT(GROUP) * 100,
  246. DECIMAL4_1 six := COUNT(GROUP, n = 6) / COUNT(GROUP) * 100,
  247. DECIMAL4_1 seven := COUNT(GROUP, n = 7) / COUNT(GROUP) * 100,
  248. DECIMAL4_1 eight := COUNT(GROUP, n = 8) / COUNT(GROUP) * 100,
  249. DECIMAL4_1 nine := COUNT(GROUP, n = 9) / COUNT(GROUP) * 100,
  250. DECIMAL7_3 chi_squared := 0, // Fill in later
  251. UNSIGNED8 num_values := COUNT(GROUP),
  252. STRING attribute := %'@name'%
  253. },
  254. MERGE
  255. )
  256. #END
  257. #END
  258. #END;
  259. // Helper function for computing chi-squared values from the interim results
  260. #UNIQUENAME(ComputeChiSquared);
  261. LOCAL %ComputeChiSquared%(%DataRec% expected, %DataRec% actual, UNSIGNED1 pos) := FUNCTION
  262. Term(DECIMAL4_1 e, DECIMAL4_1 o) := (((o - e) * (o - e)) / e);
  263. RETURN Term(expected.one, actual.one)
  264. + Term(expected.two, actual.two)
  265. + Term(expected.three, actual.three)
  266. + Term(expected.four, actual.four)
  267. + Term(expected.five, actual.five)
  268. + Term(expected.six, actual.six)
  269. + Term(expected.seven, actual.seven)
  270. + Term(expected.eight, actual.eight)
  271. + Term(expected.nine, actual.nine)
  272. + IF(pos = 1, 0, Term(expected.zero, actual.zero));
  273. END;
  274. // Insert the chi-squared results
  275. #UNIQUENAME(chiSquaredResult);
  276. LOCAL %chiSquaredResult% := PROJECT
  277. (
  278. %interimResult%,
  279. TRANSFORM
  280. (
  281. RECORDOF(LEFT),
  282. SELF.chi_squared := IF(LEFT.%idField% > 0, %ComputeChiSquared%(%expectedDS%[1], LEFT, %clampedDigit%), %CHI_SQUARED_CRITICAL_VALUE%),
  283. SELF := LEFT
  284. )
  285. );
  286. // Rewrite the result into our final format; changes:
  287. // - Sort by ID field to put rows into the proper order
  288. // - Remove the ID field
  289. // - Make the attribute field first
  290. #UNIQUENAME(finalResult);
  291. LOCAL %finalResult% := PROJECT
  292. (
  293. SORT(%chiSquaredResult%, %idField%),
  294. {
  295. STRING attribute,
  296. RECORDOF(%chiSquaredResult%) - [%idField%, attribute]
  297. }
  298. );
  299. RETURN %finalResult%;
  300. ENDMACRO;