123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- /**
- * Benford's law, also called the Newcomb–Benford law, or the law of anomalous
- * numbers, is an observation about the frequency distribution of leading
- * digits in many real-life sets of numerical data.
- *
- * Benford's law doesn't apply to every set of numbers, but it usually applies
- * to large sets of naturally occurring numbers with some connection like:
- *
- * Companies' stock market values
- * Data found in texts — like the Reader's Digest, or a copy of Newsweek
- * Demographic data, including state and city populations
- * Income tax data
- * Mathematical tables, like logarithms
- * River drainage rates
- * Scientific data
- *
- * The law usually doesn't apply to data sets that have a stated minimum and
- * maximum, like interest rates or hourly wages. If numbers are assigned,
- * rather than naturally occurring, they will also not follow the law. Examples
- * of assigned numbers include: zip codes, telephone numbers and Social
- * Security numbers.
- *
- * For more information: https://en.wikipedia.org/wiki/Benford%27s_law
- *
- * This function computes the distribution of digits within one or more
- * attributes in a dataset and displays the result, one attribute per row,
- * with an "expected" row showing the expected distributions. Included
- * in each data row is a chi-squared computation for that row indicating how
- * well the computed result matches the expected result (if the chi-squared
- * value exceeds the one shown in the --EXPECTED-- row then the data row
- * DOES NOT follow Benford's Law).
- *
- * Note that when computing the distribution of the most significant digit,
- * the digit zero is ignored. So for instance, the values 0100, 100, 1.0,
- * 0.10, and 0.00001 all have a most-significant digit of '1'. The digit
- * zero is considered for all other positions.
- *
- * @param inFile The dataset to process; REQUIRED
- * @param fieldListStr A string containing a comma-delimited list of
- * attribute names to process; note that attributes
- * listed here must be top-level attributes (not child
- * records or child datasets); use an empty string to
- * process all top-level attributes in inFile;
- * OPTIONAL, defaults to an empty string
- * @param digit The 1-based digit within the number to examine; the
- * first significant digit is '1' and it only increases;
- * OPTIONAL, defaults to 1, meaning the most-significant
- * non-zero digit
- * @param sampleSize A positive integer representing a percentage of
- * inFile to examine, which is useful when analyzing a
- * very large dataset and only an estimated data
- * analysis is sufficient; valid range for this
- * argument is 1-100; values outside of this range
- * will be clamped; OPTIONAL, defaults to 100 (which
- * indicates that all rows in the dataset will be used)
- *
- * @return A new dataset with the following record structure:
- *
- * RECORD
- * STRING attribute; // Name of data attribute examined
- * DECIMAL4_1 zero; // Percentage of rows with digit of '0'
- * DECIMAL4_1 one; // Percentage of rows with digit of '1'
- * DECIMAL4_1 two; // Percentage of rows with digit of '2'
- * DECIMAL4_1 three; // Percentage of rows with digit of '3'
- * DECIMAL4_1 four; // Percentage of rows with digit of '4'
- * DECIMAL4_1 five; // Percentage of rows with digit of '5'
- * DECIMAL4_1 six; // Percentage of rows with digit of '6'
- * DECIMAL4_1 seven; // Percentage of rows with digit of '7'
- * DECIMAL4_1 eight; // Percentage of rows with digit of '8'
- * DECIMAL4_1 nine; // Percentage of rows with digit of '9'
- * DECIMAL7_3 chi_squared; // Chi-squared "fitness test" result
- * UNSIGNED8 num_values; // Number of rows with non-zero values for this attribute
- * END;
- *
- * The named digit fields (e.g. "zero" and "one" and so on) represent the
- * digit found in the 'digit' position of the associated attribute. The values
- * that appear there are percentages. num_values shows the number of
- * non-zero values processed, and chi_squared shows the result of applying
- * that test using the observed vs expected distribution values.
- *
- * The first row of the results will show the expected values for the named
- * digits, with "-- EXPECTED DIGIT n --" showing as the attribute name.'n' will
- * be replaced with the value of 'digit' which indicates which digit position
- * was examined.
- *
- * Note that when viewing the results for the mosts significant digit (digit = 1),
- * the 'zero' field will show a -1 value, indicating that it was ignored.
- */
- EXPORT Benford(inFile, fieldListStr = '\'\'', digit = 1, sampleSize = 100) := FUNCTIONMACRO
- #UNIQUENAME(minDigit);
- LOCAL %minDigit% := MAX((INTEGER)digit, 1);
- #UNIQUENAME(clampedDigit);
- LOCAL %clampedDigit% := MIN(%minDigit%, 4);
- // Chi-squared critical value table:
- // https://www.itl.nist.gov/div898/handbook/eda/section3/eda3674.htm
- // Chi-squared critical values for 8 degrees of freedom at various probabilities
- // Probability: 0.90 0.95 0.975 0.99 0.999
- // Critical value: 13.362 15.507 17.535 20.090 26.125
- #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE_1);
- #SET(CHI_SQUARED_CRITICAL_VALUE_1, 20.090); // 99% probability
- // Chi-squared critical values for 9 degrees of freedom at various probabilities
- // Probability: 0.90 0.95 0.975 0.99 0.999
- // Critical value: 14.684 16.919 19.023 21.666 27.877
- #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE_2);
- #SET(CHI_SQUARED_CRITICAL_VALUE_2, 21.666); // 99% probability
- #UNIQUENAME(CHI_SQUARED_CRITICAL_VALUE);
- LOCAL %CHI_SQUARED_CRITICAL_VALUE% := IF(%clampedDigit% = 1, %CHI_SQUARED_CRITICAL_VALUE_1%, %CHI_SQUARED_CRITICAL_VALUE_2%);
- #UNIQUENAME(expectedDistribution);
- LOCAL %expectedDistribution% := DATASET
- (
- [
- {1, -1, 30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6},
- {2, 12.0, 11.4, 10.9, 10.4, 10.0, 9.7, 9.3, 9.0, 8.8, 8.5},
- {3, 10.2, 10.1, 10.1, 10.1, 10.0, 10.0, 9.9, 9.9, 9.9, 9.8},
- {4, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0}
- ],
- {
- UNSIGNED1 pos,
- DECIMAL4_1 zero,
- DECIMAL4_1 one,
- DECIMAL4_1 two,
- DECIMAL4_1 three,
- DECIMAL4_1 four,
- DECIMAL4_1 five,
- DECIMAL4_1 six,
- DECIMAL4_1 seven,
- DECIMAL4_1 eight,
- DECIMAL4_1 nine
- }
- );
- // Remove all spaces from field list so we can parse it more easily
- #UNIQUENAME(trimmedFieldList);
- LOCAL %trimmedFieldList% := TRIM((STRING)fieldListStr, ALL);
- // Ungroup the given dataset, in case it was grouped
- #UNIQUENAME(ungroupedInFile);
- LOCAL %ungroupedInFile% := UNGROUP(inFile);
- // Clamp the sample size to something reasonable
- #UNIQUENAME(clampedSampleSize);
- LOCAL %clampedSampleSize% := MAX(1, MIN(100, (INTEGER)sampleSize));
- // Create a sample dataset if needed
- #UNIQUENAME(sampledData);
- LOCAL %sampledData% := IF
- (
- %clampedSampleSize% < 100,
- ENTH(%ungroupedInFile%, %clampedSampleSize%, 100, 1, LOCAL),
- %ungroupedInFile%
- );
- // Slim the dataset if the caller provided an explicit set of attributes;
- // note that the TABLE function will fail if %trimmedFieldList% cites an
- // attribute that is a child dataset (this is an ECL limitation)
- #UNIQUENAME(workingInFile);
- LOCAL %workingInFile% :=
- #IF(%trimmedFieldList% = '')
- %sampledData%
- #ELSE
- TABLE(%sampledData%, {#EXPAND(%trimmedFieldList%)})
- #END;
- // Helper function that returns the 'pos' significant digit in a string;
- // if pos = 1 then th digit must be non-zero; returns 10
- // (an invalid *digit*) if no suitable digit is found
- #UNIQUENAME(NthDigit);
- LOCAL UNSIGNED1 %NthDigit%(STRING s, UNSIGNED1 pos) := EMBED(C++)
- #option pure
- unsigned char foundDigit = 10; // impossible value
- int digitsFound = 0;
- for (unsigned int x = 0; x < lenS; x++)
- {
- char ch = s[x];
- if (isdigit(ch) && (digitsFound > 0 || ch != '0'))
- {
- ++digitsFound;
- if (digitsFound >= pos)
- {
- foundDigit = ch - '0';
- break;
- }
- // Once we find a significant digit, the default return value
- // is a trailing zero (assumed after an implied decimal point
- // if we're parsing an integer)
- foundDigit = 0;
- }
- else if (ch == '.')
- {
- // Once we find a decimal point, the default return value
- // is a trailing zero
- foundDigit = 0;
- }
- }
- return foundDigit;
- ENDEMBED;
- // Temp field name we will use to ensure proper ordering of results
- #UNIQUENAME(idField);
- // One-record dataset containing expected Benford results, per-digit
- #UNIQUENAME(expectedDS);
- LOCAL %expectedDS% := PROJECT
- (
- %expectedDistribution%(pos = %clampedDigit%),
- TRANSFORM
- (
- {
- UNSIGNED2 %idField%,
- RECORDOF(LEFT) - [pos],
- DECIMAL7_3 chi_squared,
- UNSIGNED8 num_values,
- STRING attribute // Put this at the end for now, because it is variable-length
- },
- SELF.%idField% := 0,
- SELF.chi_squared := 0,
- SELF.num_values := COUNT(%workingInFile%),
- SELF.attribute := '-- EXPECTED DIGIT ' + (STRING)%minDigit% + ' --',
- SELF := LEFT
- )
- );
- // This will be used later as a datatype in a function signature
- #UNIQUENAME(DataRec);
- LOCAL %DataRec% := RECORDOF(%expectedDS%);
- // Get the internal representation of our working dataset
- #EXPORTXML(inFileFields, RECORDOF(%workingInFile%));
- // Create a dataset composed of the expectedDS and a row for each
- // field we will be processing
- #UNIQUENAME(interimResult);
- LOCAL %interimResult% := %expectedDS%
- #UNIQUENAME(recLevel)
- #SET(recLevel, 0)
- #UNIQUENAME(fieldNum)
- #SET(fieldNum, 0)
- #FOR(inFileFields)
- #FOR(Field)
- #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
- #SET(recLevel, %recLevel% + 1)
- #ELSEIF(%{@isEnd}% = 1)
- #SET(recLevel, %recLevel% - 1)
- #ELSEIF(%recLevel% = 0)
- #SET(fieldNum, %fieldNum% + 1)
- + TABLE
- (
- PROJECT(%workingInFile%, TRANSFORM({UNSIGNED1 n}, SELF.n := %NthDigit%((STRING)LEFT.%@name%, %minDigit%)))(n != 10),
- {
- UNSIGNED2 %idField% := %fieldNum%,
- DECIMAL4_1 zero := IF(%minDigit% = 1, -1.0, COUNT(GROUP, n = 0) / COUNT(GROUP) * 100),
- DECIMAL4_1 one := COUNT(GROUP, n = 1) / COUNT(GROUP) * 100,
- DECIMAL4_1 two := COUNT(GROUP, n = 2) / COUNT(GROUP) * 100,
- DECIMAL4_1 three := COUNT(GROUP, n = 3) / COUNT(GROUP) * 100,
- DECIMAL4_1 four := COUNT(GROUP, n = 4) / COUNT(GROUP) * 100,
- DECIMAL4_1 five := COUNT(GROUP, n = 5) / COUNT(GROUP) * 100,
- DECIMAL4_1 six := COUNT(GROUP, n = 6) / COUNT(GROUP) * 100,
- DECIMAL4_1 seven := COUNT(GROUP, n = 7) / COUNT(GROUP) * 100,
- DECIMAL4_1 eight := COUNT(GROUP, n = 8) / COUNT(GROUP) * 100,
- DECIMAL4_1 nine := COUNT(GROUP, n = 9) / COUNT(GROUP) * 100,
- DECIMAL7_3 chi_squared := 0, // Fill in later
- UNSIGNED8 num_values := COUNT(GROUP),
- STRING attribute := %'@name'%
- },
- MERGE
- )
- #END
- #END
- #END;
- // Helper function for computing chi-squared values from the interim results
- #UNIQUENAME(ComputeChiSquared);
- LOCAL %ComputeChiSquared%(%DataRec% expected, %DataRec% actual, UNSIGNED1 pos) := FUNCTION
- Term(DECIMAL4_1 e, DECIMAL4_1 o) := (((o - e) * (o - e)) / e);
- RETURN Term(expected.one, actual.one)
- + Term(expected.two, actual.two)
- + Term(expected.three, actual.three)
- + Term(expected.four, actual.four)
- + Term(expected.five, actual.five)
- + Term(expected.six, actual.six)
- + Term(expected.seven, actual.seven)
- + Term(expected.eight, actual.eight)
- + Term(expected.nine, actual.nine)
- + IF(pos = 1, 0, Term(expected.zero, actual.zero));
- END;
- // Insert the chi-squared results
- #UNIQUENAME(chiSquaredResult);
- LOCAL %chiSquaredResult% := PROJECT
- (
- %interimResult%,
- TRANSFORM
- (
- RECORDOF(LEFT),
- SELF.chi_squared := IF(LEFT.%idField% > 0, %ComputeChiSquared%(%expectedDS%[1], LEFT, %clampedDigit%), %CHI_SQUARED_CRITICAL_VALUE%),
- SELF := LEFT
- )
- );
- // Rewrite the result into our final format; changes:
- // - Sort by ID field to put rows into the proper order
- // - Remove the ID field
- // - Make the attribute field first
- #UNIQUENAME(finalResult);
- LOCAL %finalResult% := PROJECT
- (
- SORT(%chiSquaredResult%, %idField%),
- {
- STRING attribute,
- RECORDOF(%chiSquaredResult%) - [%idField%, attribute]
- }
- );
- RETURN %finalResult%;
- ENDMACRO;
|