Profile.ecl 84 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608
  1. /***
  2. * Function macro for profiling all or part of a dataset. The output is a
  3. * dataset containing the following information for each profiled attribute:
  4. *
  5. * attribute The name of the attribute
  6. * given_attribute_type The ECL type of the attribute as it was defined
  7. * in the input dataset
  8. * best_attribute_type An ECL data type that both allows all values
  9. * in the input dataset and consumes the least
  10. * amount of memory
  11. * rec_count The number of records analyzed in the dataset;
  12. * this may be fewer than the total number of
  13. * records, if the optional sampleSize argument
  14. * was provided with a value less than 100
  15. * fill_count The number of rec_count records containing
  16. * non-nil values; a 'nil value' is an empty
  17. * string, a numeric zero, or an empty SET; note
  18. * that BOOLEAN attributes are always counted as
  19. * filled, regardless of their value; also,
  20. * fixed-length DATA attributes (e.g. DATA10) are
  21. * also counted as filled, given their typical
  22. * function of holding data blobs
  23. * fill_rate The percentage of rec_count records containing
  24. * non-nil values; this is basically
  25. * fill_count / rec_count * 100
  26. * cardinality The number of unique, non-nil values within
  27. * the attribute
  28. * cardinality_breakdown For those attributes with a low number of
  29. * unique, non-nil values, show each value and the
  30. * number of records containing that value; the
  31. * lcbLimit parameter governs what "low number"
  32. * means
  33. * modes The most common values in the attribute, after
  34. * coercing all values to STRING, along with the
  35. * number of records in which the values were
  36. * found; if no value is repeated more than once
  37. * then no mode will be shown; up to five (5)
  38. * modes will be shown; note that string values
  39. * longer than the maxPatternLen argument will
  40. * be truncated
  41. * min_length For SET datatypes, the fewest number of elements
  42. * found in the set; for other data types, the
  43. * shortest length of a value when expressed
  44. * as a string; null values are ignored
  45. * max_length For SET datatypes, the largest number of elements
  46. * found in the set; for other data types, the
  47. * longest length of a value when expressed
  48. * as a string; null values are ignored
  49. * ave_length For SET datatypes, the average number of elements
  50. * found in the set; for other data types, the
  51. * average length of a value when expressed
  52. * as a string; null values are ignored
  53. * popular_patterns The most common patterns of values; see below
  54. * rare_patterns The least common patterns of values; see below
  55. * is_numeric Boolean indicating if the original attribute
  56. * was a numeric scalar or if the best_attribute_type
  57. * value was a numeric scaler; if TRUE then the
  58. * numeric_xxxx output fields will be
  59. * populated with actual values; if this value
  60. * is FALSE then all numeric_xxxx output values
  61. * should be ignored
  62. * numeric_min The smallest non-nil value found within the
  63. * attribute as a DECIMAL; this value is valid only
  64. * if is_numeric is TRUE; if is_numeric is FALSE
  65. * then zero will show here
  66. * numeric_max The largest non-nil value found within the
  67. * attribute as a DECIMAL;this value is valid only
  68. * if is_numeric is TRUE; if is_numeric is FALSE
  69. * then zero will show here
  70. * numeric_mean The mean (average) non-nil value found within
  71. * the attribute as a DECIMAL; this value is valid
  72. * only if is_numeric is TRUE; if is_numeric is FALSE
  73. * then zero will show here
  74. * numeric_std_dev The standard deviation of the non-nil values
  75. * in the attribute as a DECIMAL; this value is valid
  76. * only if is_numeric is TRUE; if is_numeric is FALSE
  77. * then zero will show here
  78. * numeric_lower_quartile The value separating the first (bottom) and
  79. * second quarters of non-nil values within
  80. * the attribute as a DECIMAL; this value is valid only
  81. * if is_numeric is TRUE; if is_numeric is FALSE
  82. * then zero will show here
  83. * numeric_median The median non-nil value within the attribute
  84. * as a DECIMAL; this value is valid only
  85. * if is_numeric is TRUE; if is_numeric is FALSE
  86. * then zero will show here
  87. * numeric_upper_quartile The value separating the third and fourth
  88. * (top) quarters of non-nil values within
  89. * the attribute as a DECIMAL; this value is valid only
  90. * if is_numeric is TRUE; if is_numeric is FALSE
  91. * then zero will show here
  92. * correlations A child dataset containing correlation values
  93. * comparing the current numeric attribute with all
  94. * other numeric attributes, listed in descending
  95. * correlation value order; the attribute must be
  96. * a numeric ECL datatype; non-numeric attributes
  97. * will return an empty child dataset; note that
  98. * this can be a time-consuming operation,
  99. * depending on the number of numeric attributes
  100. * in your dataset and the number of rows (if you
  101. * have N numeric attributes, then
  102. * N * (N - 1) / 2 calculations are performed,
  103. * each scanning all data rows)
  104. *
  105. * Most profile outputs can be disabled. See the 'features' argument, below.
  106. *
  107. * Data patterns can give you an idea of what your data looks like when it is
  108. * expressed as a (human-readable) string. The function converts each
  109. * character of the string into a fixed character palette to producing a "data
  110. * pattern" and then counts the number of unique patterns for that attribute.
  111. * The most- and least-popular patterns from the data will be shown in the
  112. * output, along with the number of times that pattern appears and an example
  113. * (randomly chosen from the actual data). The character palette used is:
  114. *
  115. * A Any uppercase letter
  116. * a Any lowercase letter
  117. * 9 Any numeric digit
  118. * B A boolean value (true or false)
  119. *
  120. * All other characters are left as-is in the pattern.
  121. *
  122. * Function parameters:
  123. *
  124. * @param inFile The dataset to process; this could be a child
  125. * dataset (e.g. inFile.childDS); REQUIRED
  126. * @param fieldListStr A string containing a comma-delimited list of
  127. * attribute names to process; use an empty string to
  128. * process all attributes in inFile; OPTIONAL,
  129. * defaults to an empty string
  130. * @param maxPatterns The maximum number of patterns (both popular and
  131. * rare) to return for each attribute; OPTIONAL,
  132. * defaults to 100
  133. * @param maxPatternLen The maximum length of a pattern; longer patterns
  134. * are truncated in the output; this value is also
  135. * used to set the maximum length of the data to
  136. * consider when finding cardinality and mode values;
  137. * must be 33 or larger; OPTIONAL, defaults to 100
  138. * @param features A comma-delimited string listing the profiling
  139. * elements to be included in the output; OPTIONAL,
  140. * defaults to a comma-delimited string containing all
  141. * of the available keywords:
  142. * KEYWORD AFFECTED OUTPUT
  143. * fill_rate fill_rate
  144. * fill_count
  145. * cardinality cardinality
  146. * cardinality_breakdown cardinality_breakdown
  147. * best_ecl_types best_attribute_type
  148. * modes modes
  149. * lengths min_length
  150. * max_length
  151. * ave_length
  152. * patterns popular_patterns
  153. * rare_patterns
  154. * min_max numeric_min
  155. * numeric_max
  156. * mean numeric_mean
  157. * std_dev numeric_std_dev
  158. * quartiles numeric_lower_quartile
  159. * numeric_median
  160. * numeric_upper_quartile
  161. * correlations correlations
  162. * To omit the output associated with a single keyword,
  163. * set this argument to a comma-delimited string
  164. * containing all other keywords; note that the
  165. * is_numeric output will appear only if min_max,
  166. * mean, std_dev, quartiles, or correlations features
  167. * are active; also note that enabling the
  168. * cardinality_breakdown feature will also enable
  169. * the cardinality feature, even if it is not
  170. * explicitly enabled
  171. * @param sampleSize A positive integer representing a percentage of
  172. * inFile to examine, which is useful when analyzing a
  173. * very large dataset and only an estimated data
  174. * profile is sufficient; valid range for this
  175. * argument is 1-100; values outside of this range
  176. * will be clamped; OPTIONAL, defaults to 100 (which
  177. * indicates that the entire dataset will be analyzed)
  178. * @param lcbLimit A positive integer (<= 1000) indicating the maximum
  179. * cardinality allowed for an attribute in order to
  180. * emit a breakdown of the attribute's values; this
  181. * parameter will be ignored if cardinality_breakdown
  182. * is not included in the features argument; OPTIONAL,
  183. * defaults to 64
  184. */
  185. EXPORT Profile(inFile,
  186. fieldListStr = '\'\'',
  187. maxPatterns = 100,
  188. maxPatternLen = 100,
  189. features = '\'fill_rate,best_ecl_types,cardinality,cardinality_breakdown,modes,lengths,patterns,min_max,mean,std_dev,quartiles,correlations\'',
  190. sampleSize = 100,
  191. lcbLimit = 64) := FUNCTIONMACRO
  192. LOADXML('<xml/>');
  193. #UNIQUENAME(temp); // Ubiquitous "contains random things"
  194. #UNIQUENAME(scalarFields); // Contains a delimited list of scalar attributes (full names) along with their indexed positions
  195. #UNIQUENAME(explicitScalarFields); // Contains a delimited list of scalar attributes (full names) without indexed positions
  196. #UNIQUENAME(childDSFields); // Contains a delimited list of child dataset attributes (full names) along with their indexed positions
  197. #UNIQUENAME(fieldCount); // Contains the number of fields we've seen while processing record layouts
  198. #UNIQUENAME(recLevel); // Will be used to determine at which level we are processing
  199. #UNIQUENAME(fieldStack); // String-based stack telling us whether we're within an embedded dataset or record
  200. #UNIQUENAME(namePrefix); // When processing child records and datasets, contains the leading portion of the attribute's full name
  201. #UNIQUENAME(fullName); // The full name of an attribute
  202. #UNIQUENAME(needsDelim); // Boolean indicating whether we need to insert a delimiter somewhere
  203. #UNIQUENAME(namePos); // Contains character offset information, for parsing delimited strings
  204. #UNIQUENAME(numValue); // Extracted numeric value from a string
  205. #UNIQUENAME(nameValue); // Extracted string value from a string
  206. IMPORT Std;
  207. //--------------------------------------------------------------------------
  208. // Remove all spaces from features list so we can parse it more easily
  209. #UNIQUENAME(trimmedFeatures);
  210. LOCAL %trimmedFeatures% := TRIM(features, ALL);
  211. // Remove all spaces from field list so we can parse it more easily
  212. #UNIQUENAME(trimmedFieldList);
  213. LOCAL %trimmedFieldList% := TRIM((STRING)fieldListStr, ALL);
  214. // Clamp lcbLimit to 0..1000
  215. #UNIQUENAME(lowCardinalityThreshold);
  216. LOCAL %lowCardinalityThreshold% := MIN(MAX(lcbLimit, 0), 1000);
  217. // The maximum number of mode values to return
  218. #UNIQUENAME(MAX_MODES);
  219. LOCAL %MAX_MODES% := 5;
  220. // Typedefs
  221. #UNIQUENAME(Attribute_t);
  222. LOCAL %Attribute_t% := STRING;
  223. #UNIQUENAME(AttributeType_t);
  224. LOCAL %AttributeType_t% := STRING36;
  225. #UNIQUENAME(NumericStat_t);
  226. LOCAL %NumericStat_t% := DECIMAL32_4;
  227. // Tests for enabled features
  228. #UNIQUENAME(FeatureEnabledFillRate);
  229. LOCAL %FeatureEnabledFillRate%() := REGEXFIND('\\bfill_rate\\b', %trimmedFeatures%, NOCASE);
  230. #UNIQUENAME(FeatureEnabledBestECLTypes);
  231. LOCAL %FeatureEnabledBestECLTypes%() := REGEXFIND('\\bbest_ecl_types\\b', %trimmedFeatures%, NOCASE);
  232. #UNIQUENAME(FeatureEnabledLowCardinalityBreakdown);
  233. LOCAL %FeatureEnabledLowCardinalityBreakdown%() := %lowCardinalityThreshold% > 0 AND REGEXFIND('\\bcardinality_breakdown\\b', %trimmedFeatures%, NOCASE);
  234. #UNIQUENAME(FeatureEnabledCardinality);
  235. LOCAL %FeatureEnabledCardinality%() := %FeatureEnabledLowCardinalityBreakdown%() OR REGEXFIND('\\bcardinality\\b', %trimmedFeatures%, NOCASE);
  236. #UNIQUENAME(FeatureEnabledModes);
  237. LOCAL %FeatureEnabledModes%() := REGEXFIND('\\bmodes\\b', %trimmedFeatures%, NOCASE);
  238. #UNIQUENAME(FeatureEnabledLengths);
  239. LOCAL %FeatureEnabledLengths%() := REGEXFIND('\\blengths\\b', %trimmedFeatures%, NOCASE);
  240. #UNIQUENAME(FeatureEnabledPatterns);
  241. LOCAL %FeatureEnabledPatterns%() := (UNSIGNED)maxPatterns > 0 AND REGEXFIND('\\bpatterns\\b', %trimmedFeatures%, NOCASE);
  242. #UNIQUENAME(FeatureEnabledMinMax);
  243. LOCAL %FeatureEnabledMinMax%() := REGEXFIND('\\bmin_max\\b', %trimmedFeatures%, NOCASE);
  244. #UNIQUENAME(FeatureEnabledMean);
  245. LOCAL %FeatureEnabledMean%() := REGEXFIND('\\bmean\\b', %trimmedFeatures%, NOCASE);
  246. #UNIQUENAME(FeatureEnabledStdDev);
  247. LOCAL %FeatureEnabledStdDev%() := REGEXFIND('\\bstd_dev\\b', %trimmedFeatures%, NOCASE);
  248. #UNIQUENAME(FeatureEnabledQuartiles);
  249. LOCAL %FeatureEnabledQuartiles%() := REGEXFIND('\\bquartiles\\b', %trimmedFeatures%, NOCASE);
  250. #UNIQUENAME(FeatureEnabledCorrelations);
  251. LOCAL %FeatureEnabledCorrelations%() := REGEXFIND('\\bcorrelations\\b', %trimmedFeatures%, NOCASE);
  252. //--------------------------------------------------------------------------
  253. // Ungroup the given dataset, in case it was grouped
  254. #UNIQUENAME(ungroupedInFile);
  255. LOCAL %ungroupedInFile% := UNGROUP(inFile);
  256. // Clamp the sample size to something reasonable
  257. #UNIQUENAME(clampedSampleSize);
  258. LOCAL %clampedSampleSize% := MAX(1, MIN(100, (INTEGER)sampleSize));
  259. // Create a sample dataset if needed
  260. #UNIQUENAME(sampledData);
  261. LOCAL %sampledData% := IF
  262. (
  263. %clampedSampleSize% < 100,
  264. ENTH(%ungroupedInFile%, %clampedSampleSize%, 100, 1, LOCAL),
  265. %ungroupedInFile%
  266. );
  267. // Slim the dataset if the caller provided an explicit set of attributes
  268. #UNIQUENAME(workingInFile);
  269. LOCAL %workingInFile% :=
  270. #IF(%trimmedFieldList% = '')
  271. %sampledData%
  272. #ELSE
  273. TABLE
  274. (
  275. %sampledData%,
  276. {
  277. #SET(needsDelim, 0)
  278. #SET(namePos, 1)
  279. #LOOP
  280. #SET(temp, REGEXFIND('^([^,]+)', %trimmedFieldList%[%namePos%..], 1))
  281. #IF(%'temp'% != '')
  282. #IF(%needsDelim% = 1) , #END
  283. TYPEOF(%sampledData%.%temp%) %temp% := %temp%
  284. #SET(needsDelim, 1)
  285. #SET(namePos, %namePos% + LENGTH(%'temp'%) + 1)
  286. #ELSE
  287. #BREAK
  288. #END
  289. #END
  290. }
  291. )
  292. #END;
  293. // Distribute the inbound dataset across all our nodes for faster processing
  294. #UNIQUENAME(distributedInFile);
  295. LOCAL %distributedInFile% := DISTRIBUTE(%workingInFile%, SKEW(0.05));
  296. #EXPORTXML(inFileFields, RECORDOF(%distributedInFile%));
  297. // Walk the slimmed dataset, pulling out top-level scalars and noting
  298. // child datasets
  299. #SET(scalarFields, '');
  300. #SET(childDSFields, '');
  301. #SET(fieldCount, 0);
  302. #SET(recLevel, 0);
  303. #SET(fieldStack, '');
  304. #SET(namePrefix, '');
  305. #SET(fullName, '');
  306. #FOR(inFileFields)
  307. #FOR(Field)
  308. #SET(fieldCount, %fieldCount% + 1)
  309. #IF(%{@isEnd}% != 1)
  310. // Adjust full name
  311. #SET(fullName, %'namePrefix'% + %'@name'%)
  312. #END
  313. #IF(%{@isRecord}% = 1)
  314. // Push record onto stack so we know what we're popping when we see @isEnd
  315. #SET(fieldStack, 'r' + %'fieldStack'%)
  316. #APPEND(namePrefix, %'@name'% + '.')
  317. #ELSEIF(%{@isDataset}% = 1)
  318. // Push dataset onto stack so we know what we're popping when we see @isEnd
  319. #SET(fieldStack, 'd' + %'fieldStack'%)
  320. #APPEND(namePrefix, %'@name'% + '.')
  321. #SET(recLevel, %recLevel% + 1)
  322. // Note the field index and field name so we can process it separately
  323. #IF(%'childDSFields'% != '')
  324. #APPEND(childDSFields, ',')
  325. #END
  326. #APPEND(childDSFields, %'fieldCount'% + ':' + %'fullName'%)
  327. // Extract the child dataset into its own attribute so we can more easily
  328. // process it later
  329. #SET(temp, #MANGLE(%'fullName'%));
  330. LOCAL %temp% := NORMALIZE
  331. (
  332. %distributedInFile%,
  333. LEFT.%fullName%,
  334. TRANSFORM
  335. (
  336. RECORDOF(%distributedInFile%.%fullName%),
  337. SELF := RIGHT
  338. )
  339. );
  340. #ELSEIF(%{@isEnd}% = 1)
  341. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  342. #IF(%'fieldStack'%[1] = 'd')
  343. #SET(recLevel, %recLevel% - 1)
  344. #END
  345. #SET(fieldStack, %'fieldStack'%[2..])
  346. #ELSEIF(%recLevel% = 0)
  347. // Note the field index and full name of the attribute so we can process it
  348. #IF(%'scalarFields'% != '')
  349. #APPEND(scalarFields, ',')
  350. #END
  351. #APPEND(scalarFields, %'fieldCount'% + ':' + %'fullName'%)
  352. #END
  353. #END
  354. #END
  355. // Collect the gathered full attribute names so we can walk them later
  356. #SET(explicitScalarFields, REGEXREPLACE('\\d+:', %'scalarFields'%, ''));
  357. // Define the record layout that will be used by the inner _Inner_Profile() call
  358. LOCAL ModeRec := RECORD
  359. STRING value;
  360. UNSIGNED4 rec_count;
  361. END;
  362. LOCAL PatternCountRec := RECORD
  363. STRING data_pattern;
  364. UNSIGNED4 rec_count;
  365. STRING example;
  366. END;
  367. LOCAL CorrelationRec := RECORD
  368. STRING attribute;
  369. DECIMAL7_6 corr;
  370. END;
  371. LOCAL OutputLayout := RECORD
  372. STRING sortValue;
  373. STRING attribute;
  374. UNSIGNED4 rec_count;
  375. STRING given_attribute_type;
  376. DECIMAL9_6 fill_rate;
  377. UNSIGNED4 fill_count;
  378. UNSIGNED4 cardinality;
  379. DATASET(ModeRec) cardinality_breakdown {MAXCOUNT(%lowCardinalityThreshold%)};
  380. STRING best_attribute_type;
  381. DATASET(ModeRec) modes {MAXCOUNT(%MAX_MODES%)};
  382. UNSIGNED4 min_length;
  383. UNSIGNED4 max_length;
  384. UNSIGNED4 ave_length;
  385. DATASET(PatternCountRec) popular_patterns {MAXCOUNT((UNSIGNED)maxPatterns)};
  386. DATASET(PatternCountRec) rare_patterns {MAXCOUNT((UNSIGNED)maxPatterns)};
  387. BOOLEAN is_numeric;
  388. %NumericStat_t% numeric_min;
  389. %NumericStat_t% numeric_max;
  390. %NumericStat_t% numeric_mean;
  391. %NumericStat_t% numeric_std_dev;
  392. %NumericStat_t% numeric_lower_quartile;
  393. %NumericStat_t% numeric_median;
  394. %NumericStat_t% numeric_upper_quartile;
  395. DATASET(CorrelationRec) correlations {MAXCOUNT(%fieldCount%)};
  396. END;
  397. // Define the record layout that will be returned to the caller; note
  398. // that the structure is variable, depending on the features passed
  399. // to Profile()
  400. #UNIQUENAME(FinalOutputLayout);
  401. LOCAL %FinalOutputLayout% := RECORD
  402. STRING attribute;
  403. STRING given_attribute_type;
  404. #IF(%FeatureEnabledBestECLTypes%())
  405. STRING best_attribute_type;
  406. #END
  407. UNSIGNED4 rec_count;
  408. #IF(%FeatureEnabledFillRate%())
  409. UNSIGNED4 fill_count;
  410. DECIMAL9_6 fill_rate;
  411. #END
  412. #IF(%FeatureEnabledCardinality%())
  413. UNSIGNED4 cardinality;
  414. #END
  415. #IF(%FeatureEnabledLowCardinalityBreakdown%())
  416. DATASET(ModeRec) cardinality_breakdown;
  417. #END
  418. #IF(%FeatureEnabledModes%())
  419. DATASET(ModeRec) modes;
  420. #END
  421. #IF(%FeatureEnabledLengths%())
  422. UNSIGNED4 min_length;
  423. UNSIGNED4 max_length;
  424. UNSIGNED4 ave_length;
  425. #END
  426. #IF(%FeatureEnabledPatterns%())
  427. DATASET(PatternCountRec) popular_patterns;
  428. DATASET(PatternCountRec) rare_patterns;
  429. #END
  430. #IF(%FeatureEnabledMinMax%() OR %FeatureEnabledMean%() OR %FeatureEnabledStdDev%() OR %FeatureEnabledQuartiles%() OR %FeatureEnabledCorrelations%())
  431. BOOLEAN is_numeric;
  432. #END
  433. #IF(%FeatureEnabledMinMax%())
  434. %NumericStat_t% numeric_min;
  435. %NumericStat_t% numeric_max;
  436. #END
  437. #IF(%FeatureEnabledMean%())
  438. %NumericStat_t% numeric_mean;
  439. #END
  440. #IF(%FeatureEnabledStdDev%())
  441. %NumericStat_t% numeric_std_dev;
  442. #END
  443. #IF(%FeatureEnabledQuartiles%())
  444. %NumericStat_t% numeric_lower_quartile;
  445. %NumericStat_t% numeric_median;
  446. %NumericStat_t% numeric_upper_quartile;
  447. #END
  448. #IF(%FeatureEnabledCorrelations%())
  449. DATASET(CorrelationRec) correlations;
  450. #END
  451. END;
  452. //==========================================================================
  453. // This is the meat of the function macro that actually does the profiling;
  454. // it is called with various datasets and (possibly) explicit attributes
  455. // to process and the results will eventually be combined to form the
  456. // final result; the parameters largely match the Profile() call, with the
  457. // addition of a few parameters that help place the results into the
  458. // correct format; note that the name of this function macro is not wrapped
  459. // in a UNIQUENAME -- that is due to an apparent limitation in the ECL
  460. // compiler
  461. LOCAL _Inner_Profile(_inFile,
  462. _fieldListStr,
  463. _maxPatterns,
  464. _maxPatternLen,
  465. _lcbLimit,
  466. _maxModes,
  467. _resultLayout,
  468. _attrNamePrefix,
  469. _sortPrefix) := FUNCTIONMACRO
  470. #EXPORTXML(inFileFields, RECORDOF(_inFile));
  471. #UNIQUENAME(foundMaxPatternLen); // Will become the length of the longest pattern we will be processing
  472. #SET(foundMaxPatternLen, 33); // Preset to minimum length for an attribute pattern
  473. #UNIQUENAME(explicitFields); // Attributes from _fieldListStr that are found in the top level of the dataset
  474. #UNIQUENAME(numericFields); // Numeric attributes from _fieldListStr that are found in the top level of the dataset
  475. // Validate that attribute is okay for us to process (there is no explicit
  476. // attribute list or the name is in the list)
  477. #UNIQUENAME(_CanProcessAttribute);
  478. LOCAL %_CanProcessAttribute%(STRING attrName) := (_fieldListStr = '' OR REGEXFIND('(^|,)' + attrName + '(,|$)', _fieldListStr, NOCASE));
  479. // Test an attribute type to see if is a SET OF <something>
  480. #UNIQUENAME(_IsSetType);
  481. LOCAL %_IsSetType%(STRING attrType) := (attrType[..7] = 'set of ');
  482. // Helper function to convert a full field name into something we
  483. // can reference as an ECL attribute
  484. #UNIQUENAME(_MakeAttr);
  485. LOCAL %_MakeAttr%(STRING attr) := REGEXREPLACE('\\.', attr, '_');
  486. // Pattern mapping a STRING datatype
  487. #UNIQUENAME(_MapAllStr);
  488. LOCAL STRING %_MapAllStr%(STRING s) := EMBED(C++)
  489. #option pure;
  490. __lenResult = lenS;
  491. __result = static_cast<char*>(rtlMalloc(__lenResult));
  492. for (uint32_t x = 0; x < lenS; x++)
  493. {
  494. unsigned char ch = s[x];
  495. if (ch >= 'A' && ch <= 'Z')
  496. __result[x] = 'A';
  497. else if (ch >= 'a' && ch <= 'z')
  498. __result[x] = 'a';
  499. else if (ch >= '1' && ch <= '9') // Leave '0' as-is and replace with '9' later
  500. __result[x] = '9';
  501. else
  502. __result[x] = ch;
  503. }
  504. ENDEMBED;
  505. // Pattern mapping a UNICODE datatype; using regex due to the complexity
  506. // of the character set
  507. #UNIQUENAME(_MapUpperCharUni);
  508. LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'[[:upper:]]', s, u'A');
  509. #UNIQUENAME(_MapLowerCharUni);
  510. LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[:lower:]]', s, u'a');
  511. #UNIQUENAME(_MapDigitUni);
  512. LOCAL %_MapDigitUni%(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later
  513. #UNIQUENAME(_MapAllUni);
  514. LOCAL %_MapAllUni%(UNICODE s) := (STRING)%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
  515. // Trimming strings
  516. #UNIQUENAME(_TrimmedStr);
  517. LOCAL %_TrimmedStr%(STRING s) := TRIM(s, LEFT, RIGHT);
  518. #UNIQUENAME(_TrimmedUni);
  519. LOCAL %_TrimmedUni%(UNICODE s) := TRIM(s, LEFT, RIGHT);
  520. // Collect a list of the top-level attributes that we can process,
  521. // determine the actual maximum length of a data pattern (if we can
  522. // reduce that length then we can save on memory allocation), and
  523. // collect the numeric fields for correlation
  524. #SET(needsDelim, 0);
  525. #SET(recLevel, 0);
  526. #SET(fieldStack, '');
  527. #SET(namePrefix, '');
  528. #SET(explicitFields, '');
  529. #SET(numericFields, '');
  530. #FOR(inFileFields)
  531. #FOR(Field)
  532. #IF(%{@isRecord}% = 1)
  533. #SET(fieldStack, 'r' + %'fieldStack'%)
  534. #APPEND(namePrefix, %'@name'% + '.')
  535. #ELSEIF(%{@isDataset}% = 1)
  536. #SET(fieldStack, 'd' + %'fieldStack'%)
  537. #SET(recLevel, %recLevel% + 1)
  538. #ELSEIF(%{@isEnd}% = 1)
  539. #IF(%'fieldStack'%[1] = 'd')
  540. #SET(recLevel, %recLevel% - 1)
  541. #ELSE
  542. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  543. #END
  544. #SET(fieldStack, %'fieldStack'%[2..])
  545. #ELSEIF(%recLevel% = 0)
  546. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  547. #IF(%needsDelim% = 1)
  548. #APPEND(explicitFields, ',')
  549. #END
  550. #APPEND(explicitFields, %'namePrefix'% + %'@name'%)
  551. #SET(needsDelim, 1)
  552. #IF(NOT %_IsSetType%(%'@type'%))
  553. #IF(REGEXFIND('(string)|(data)|(utf)', %'@type'%))
  554. #IF(%@size% < 0)
  555. #SET(foundMaxPatternLen, MAX(_maxPatternLen, %foundMaxPatternLen%))
  556. #ELSE
  557. #SET(foundMaxPatternLen, MIN(MAX(%@size%, %foundMaxPatternLen%), _maxPatternLen))
  558. #END
  559. #ELSEIF(REGEXFIND('unicode', %'@type'%))
  560. // UNICODE is UCS-2 so the size reflects two bytes per character
  561. #IF(%@size% < 0)
  562. #SET(foundMaxPatternLen, MAX(_maxPatternLen, %foundMaxPatternLen%))
  563. #ELSE
  564. #SET(foundMaxPatternLen, MIN(MAX(%@size% DIV 2 + 1, %foundMaxPatternLen%), _maxPatternLen))
  565. #END
  566. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  567. #IF(%'numericFields'% != '')
  568. #APPEND(numericFields, ',')
  569. #END
  570. #APPEND(numericFields, %'namePrefix'% + %'@name'%)
  571. #END
  572. #END
  573. #END
  574. #END
  575. #END
  576. #END
  577. // Typedefs
  578. #UNIQUENAME(DataPattern_t);
  579. LOCAL %DataPattern_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
  580. #UNIQUENAME(StringValue_t);
  581. LOCAL %StringValue_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
  582. // Create a dataset containing pattern information, string length, and
  583. // booleans indicating filled and numeric datatypes for each processed
  584. // attribute; note that this is created by appending a series of PROJECT
  585. // results; to protect against skew problems when dealing with attributes
  586. // with low cardinality, and to attempt to reduce our temporary storage
  587. // footprint, create a reduced dataset that contains unique values for
  588. // our attributes and the number of times the values appear, as well as
  589. // some of the other interesting bits we can collect at the same time; note
  590. // that we try to explicitly target the original attribute's data type and
  591. // perform the minimal amount of work necessary on the value to transform
  592. // it to our common structure
  593. #UNIQUENAME(DataInfoRec);
  594. LOCAL %DataInfoRec% := RECORD
  595. %Attribute_t% attribute;
  596. %AttributeType_t% given_attribute_type;
  597. %StringValue_t% string_value;
  598. UNSIGNED4 value_count;
  599. %DataPattern_t% data_pattern;
  600. UNSIGNED4 data_length;
  601. BOOLEAN is_filled;
  602. BOOLEAN is_number;
  603. END;
  604. #UNIQUENAME(dataInfo);
  605. LOCAL %dataInfo% :=
  606. #SET(recLevel, 0)
  607. #SET(fieldStack, '')
  608. #SET(namePrefix, '')
  609. #SET(needsDelim, 0)
  610. #SET(fieldCount, 0)
  611. #FOR(inFileFields)
  612. #FOR(Field)
  613. #IF(%{@isRecord}% = 1)
  614. #SET(fieldStack, 'r' + %'fieldStack'%)
  615. #APPEND(namePrefix, %'@name'% + '.')
  616. #ELSEIF(%{@isDataset}% = 1)
  617. #SET(fieldStack, 'd' + %'fieldStack'%)
  618. #SET(recLevel, %recLevel% + 1)
  619. #ELSEIF(%{@isEnd}% = 1)
  620. #IF(%'fieldStack'%[1] = 'd')
  621. #SET(recLevel, %recLevel% - 1)
  622. #ELSE
  623. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  624. #END
  625. #SET(fieldStack, %'fieldStack'%[2..])
  626. #ELSEIF(%recLevel% = 0)
  627. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  628. #SET(fieldCount, %fieldCount% + 1)
  629. #IF(%needsDelim% = 1) + #END
  630. IF(EXISTS(_inFile),
  631. PROJECT
  632. (
  633. TABLE
  634. (
  635. _inFile,
  636. {
  637. %Attribute_t% attribute := %'namePrefix'% + %'@name'%,
  638. %AttributeType_t% given_attribute_type := %'@ecltype'%,
  639. %StringValue_t% string_value :=
  640. #IF(%_IsSetType%(%'@type'%))
  641. (%StringValue_t%)Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', ')
  642. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', %'@type'%))
  643. (%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)
  644. #ELSEIF(REGEXFIND('string', %'@type'%))
  645. %_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  646. #ELSE
  647. %_TrimmedStr%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  648. #END,
  649. UNSIGNED4 value_count := COUNT(GROUP),
  650. %DataPattern_t% data_pattern :=
  651. #IF(%_IsSetType%(%'@type'%))
  652. %_MapAllStr%(%_TrimmedStr%(Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', '))[..%foundMaxPatternLen%])
  653. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  654. %_MapAllStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  655. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  656. #IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
  657. %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  658. #ELSE
  659. %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  660. #END
  661. #ELSEIF(REGEXFIND('string', %'@type'%))
  662. #IF(%@size% < 0 OR %@size% > %foundMaxPatternLen%)
  663. %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  664. #ELSE
  665. %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  666. #END
  667. #ELSEIF(%'@type'% = 'boolean')
  668. 'B'
  669. #ELSE
  670. %_MapAllStr%(%_TrimmedStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  671. #END,
  672. UNSIGNED4 data_length :=
  673. #IF(%_IsSetType%(%'@type'%))
  674. COUNT(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  675. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  676. LENGTH(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  677. #ELSEIF(REGEXFIND('string', %'@type'%))
  678. LENGTH(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  679. #ELSEIF(%'@type'% = 'boolean')
  680. 1
  681. #ELSE
  682. LENGTH((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  683. #END,
  684. BOOLEAN is_filled :=
  685. #IF(%_IsSetType%(%'@type'%))
  686. COUNT(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) > 0
  687. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  688. LENGTH(%_TrimmedUni%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))) > 0
  689. #ELSEIF(REGEXFIND('string', %'@type'%))
  690. LENGTH(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))) > 0
  691. #ELSEIF(REGEXFIND('data', %'@type'%))
  692. LENGTH(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) > 0
  693. #ELSEIF(%'@type'% = 'boolean')
  694. TRUE
  695. #ELSE
  696. _inFile.#EXPAND(%'namePrefix'% + %'@name'%) != 0
  697. #END,
  698. BOOLEAN is_number :=
  699. #IF(%_IsSetType%(%'@type'%))
  700. FALSE
  701. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  702. TRUE
  703. #ELSE
  704. FALSE
  705. #END
  706. },
  707. _inFile.#EXPAND(%'namePrefix'% + %'@name'%),
  708. LOCAL
  709. ),
  710. TRANSFORM(%DataInfoRec%, SELF := LEFT)
  711. ),
  712. DATASET
  713. (
  714. 1,
  715. TRANSFORM
  716. (
  717. %DataInfoRec%,
  718. SELF.attribute := %'namePrefix'% + %'@name'%,
  719. SELF.given_attribute_type := %'@ecltype'%,
  720. SELF := []
  721. )
  722. )
  723. )
  724. #SET(needsDelim, 1)
  725. #END
  726. #END
  727. #END
  728. #END
  729. // Insert empty value for syntax checking
  730. #IF(%fieldCount% = 0)
  731. DATASET([], %DataInfoRec%)
  732. #END;
  733. // Get only those attributes that are filled
  734. #UNIQUENAME(filledDataInfo);
  735. LOCAL %filledDataInfo% := %dataInfo%(is_filled);
  736. // Determine the best ECL data type for each attribute
  737. #UNIQUENAME(DataTypeEnum);
  738. LOCAL %DataTypeEnum% := ENUM
  739. (
  740. UNSIGNED4,
  741. AsIs = 0,
  742. SignedInteger = 1,
  743. UnsignedInteger = 2,
  744. FloatingPoint = 4,
  745. ExpNotation = 8
  746. );
  747. #UNIQUENAME(BestTypeFlag);
  748. LOCAL %DataTypeEnum% %BestTypeFlag%(STRING dataPattern, %AttributeType_t% attributeType) := FUNCTION
  749. isLeadingZeroInteger := REGEXFIND('^0[09]{1,18}$', dataPattern);
  750. isSignedInteger := REGEXFIND('^\\-[09]{1,19}$', dataPattern);
  751. isShortUnsignedInteger := REGEXFIND('^[09]{1,19}$', dataPattern);
  752. isUnsignedInteger := REGEXFIND('^\\+?[09]{1,20}$', dataPattern);
  753. isFloatingPoint := REGEXFIND('^(\\-|\\+)?[09]{0,15}\\.[09]{1,15}$', dataPattern);
  754. isExpNotation := REGEXFIND('^(\\-|\\+)?[09]\\.[09]{1,6}[aA]\\-[09]{1,3}$', dataPattern);
  755. stringWithNumbersType := MAP
  756. (
  757. isSignedInteger => %DataTypeEnum%.SignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  758. isShortUnsignedInteger => %DataTypeEnum%.SignedInteger | %DataTypeEnum%.UnsignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  759. isUnsignedInteger => %DataTypeEnum%.UnsignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  760. isFloatingPoint => %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  761. isExpNotation => %DataTypeEnum%.ExpNotation,
  762. %DataTypeEnum%.AsIs
  763. );
  764. bestType := MAP
  765. (
  766. %_IsSetType%(attributeType) => %DataTypeEnum%.AsIs,
  767. REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', attributeType) => %DataTypeEnum%.AsIs,
  768. isLeadingZeroInteger => %DataTypeEnum%.AsIs,
  769. stringWithNumbersType
  770. );
  771. RETURN bestType;
  772. END;
  773. // Estimate integer size from readable data length
  774. #UNIQUENAME(Len2Size);
  775. LOCAL %Len2Size%(UNSIGNED2 c) := MAP ( c < 3 => 1, c < 5 => 2, c < 7 => 3, c < 9 => 4, c < 11 => 5, c < 14 => 6, c < 16 => 7, 8 );
  776. #UNIQUENAME(attributeTypePatterns);
  777. LOCAL %attributeTypePatterns% := TABLE
  778. (
  779. %filledDataInfo%,
  780. {
  781. attribute,
  782. given_attribute_type,
  783. data_pattern,
  784. data_length,
  785. %DataTypeEnum% type_flag := %BestTypeFlag%(TRIM(data_pattern), given_attribute_type),
  786. UNSIGNED4 min_data_length := 0 // will be populated within %attributesWithTypeFlagsSummary%
  787. },
  788. attribute, given_attribute_type, data_pattern, data_length,
  789. MERGE
  790. );
  791. #UNIQUENAME(MinNotZero);
  792. LOCAL %MinNotZero%(UNSIGNED4 n1, UNSIGNED4 n2) := MAP
  793. (
  794. n1 = 0 => n2,
  795. n2 = 0 => n1,
  796. MIN(n1, n2)
  797. );
  798. #UNIQUENAME(attributesWithTypeFlagsSummary);
  799. LOCAL %attributesWithTypeFlagsSummary% := AGGREGATE
  800. (
  801. %attributeTypePatterns%,
  802. RECORDOF(%attributeTypePatterns%),
  803. TRANSFORM
  804. (
  805. RECORDOF(%attributeTypePatterns%),
  806. SELF.data_length := MAX(LEFT.data_length, RIGHT.data_length),
  807. SELF.min_data_length := %MinNotZero%(LEFT.data_length, RIGHT.data_length),
  808. SELF.type_flag := IF(TRIM(RIGHT.attribute) != '', LEFT.type_flag & RIGHT.type_flag, LEFT.type_flag),
  809. SELF := LEFT
  810. ),
  811. TRANSFORM
  812. (
  813. RECORDOF(%attributeTypePatterns%),
  814. SELF.data_length := MAX(RIGHT1.data_length, RIGHT2.data_length),
  815. SELF.min_data_length := %MinNotZero%(RIGHT1.data_length, RIGHT2.data_length),
  816. SELF.type_flag := RIGHT1.type_flag & RIGHT2.type_flag,
  817. SELF := RIGHT1
  818. ),
  819. LEFT.attribute,
  820. FEW
  821. );
  822. #UNIQUENAME(AttributeTypeRec);
  823. LOCAL %AttributeTypeRec% := RECORD
  824. %Attribute_t% attribute;
  825. %AttributeType_t% given_attribute_type;
  826. %AttributeType_t% best_attribute_type;
  827. END;
  828. #UNIQUENAME(attributeBestTypeInfo);
  829. LOCAL %attributeBestTypeInfo% := PROJECT
  830. (
  831. %attributesWithTypeFlagsSummary%,
  832. TRANSFORM
  833. (
  834. %AttributeTypeRec%,
  835. SELF.best_attribute_type := MAP
  836. (
  837. %_IsSetType%(LEFT.given_attribute_type) => LEFT.given_attribute_type,
  838. REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', LEFT.given_attribute_type) => LEFT.given_attribute_type,
  839. REGEXFIND('data', LEFT.given_attribute_type) => 'data' + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, ''),
  840. (LEFT.type_flag & %DataTypeEnum%.UnsignedInteger) != 0 => 'unsigned' + %Len2Size%(LEFT.data_length),
  841. (LEFT.type_flag & %DataTypeEnum%.SignedInteger) != 0 => 'integer' + %Len2Size%(LEFT.data_length),
  842. (LEFT.type_flag & %DataTypeEnum%.FloatingPoint) != 0 => 'real' + IF(LEFT.data_length < 8, '4', '8'),
  843. (LEFT.type_flag & %DataTypeEnum%.ExpNotation) != 0 => 'real8',
  844. REGEXFIND('utf', LEFT.given_attribute_type) => LEFT.given_attribute_type,
  845. REGEXREPLACE('\\d+$', TRIM(LEFT.given_attribute_type), '') + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, '')
  846. ),
  847. SELF := LEFT
  848. )
  849. );
  850. #UNIQUENAME(filledDataInfoNumeric);
  851. LOCAL %filledDataInfoNumeric% := JOIN
  852. (
  853. %filledDataInfo%,
  854. %attributeBestTypeInfo%,
  855. LEFT.attribute = RIGHT.attribute,
  856. TRANSFORM
  857. (
  858. RECORDOF(LEFT),
  859. SELF.is_number := LEFT.is_number OR (REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', RIGHT.best_attribute_type) AND NOT REGEXFIND('set of ', RIGHT.best_attribute_type)),
  860. SELF := LEFT
  861. ),
  862. LEFT OUTER, KEEP(1), SMART
  863. ) : ONWARNING(4531, IGNORE);
  864. // Build a set of attributes for quartiles, unique values, and modes for
  865. // each processed attribute
  866. #SET(recLevel, 0);
  867. #SET(fieldStack, '');
  868. #SET(namePrefix, '');
  869. #FOR(inFileFields)
  870. #FOR(Field)
  871. #IF(%{@isRecord}% = 1)
  872. #SET(fieldStack, 'r' + %'fieldStack'%)
  873. #APPEND(namePrefix, %'@name'% + '.')
  874. #ELSEIF(%{@isDataset}% = 1)
  875. #SET(fieldStack, 'd' + %'fieldStack'%)
  876. #SET(recLevel, %recLevel% + 1)
  877. #ELSEIF(%{@isEnd}% = 1)
  878. #IF(%'fieldStack'%[1] = 'd')
  879. #SET(recLevel, %recLevel% - 1)
  880. #ELSE
  881. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  882. #END
  883. #SET(fieldStack, %'fieldStack'%[2..]);
  884. #ELSEIF(%recLevel% = 0)
  885. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  886. // Note that we create explicit attributes here for all
  887. // top-level attributes in the dataset that we're
  888. // processing, even if they are not numeric datatypes
  889. #UNIQUENAME(uniqueNumericValueCounts)
  890. %uniqueNumericValueCounts% := PROJECT
  891. (
  892. %filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'% AND is_number),
  893. TRANSFORM
  894. (
  895. {
  896. REAL value,
  897. UNSIGNED6 cnt,
  898. UNSIGNED6 valueEndPos
  899. },
  900. SELF.value := (REAL)LEFT.string_value,
  901. SELF.cnt := LEFT.value_count,
  902. SELF.valueEndPos := 0
  903. )
  904. );
  905. // Explicit attributes containing scalars
  906. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_min')) := MIN(%uniqueNumericValueCounts%, value);
  907. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_max')) := MAX(%uniqueNumericValueCounts%, value);
  908. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave')) := SUM(%uniqueNumericValueCounts%, value * cnt) / SUM(%uniqueNumericValueCounts%, cnt);
  909. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_std_dev')) := SQRT(SUM(%uniqueNumericValueCounts%, (value - #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave'))) * (value - #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave'))) * cnt) / SUM(%uniqueNumericValueCounts%, cnt));
  910. // Determine the position of the last record in the original
  911. // dataset that contains a particular value
  912. #UNIQUENAME(uniqueNumericValuePos)
  913. %uniqueNumericValuePos% := ITERATE
  914. (
  915. SORT(%uniqueNumericValueCounts%, value, SKEW(1)),
  916. TRANSFORM
  917. (
  918. RECORDOF(LEFT),
  919. SELF.valueEndPos := LEFT.valueEndPos + RIGHT.cnt,
  920. SELF := RIGHT
  921. )
  922. );
  923. // The total number of records in this subset
  924. #UNIQUENAME(wholeNumRecs)
  925. LOCAL %wholeNumRecs% := MAX(%uniqueNumericValuePos%, valueEndPos);
  926. #UNIQUENAME(halfNumRecs);
  927. LOCAL %halfNumRecs% := %wholeNumRecs% DIV 2;
  928. // Find the median
  929. #UNIQUENAME(q2Pos1);
  930. LOCAL %q2Pos1% := %halfNumRecs% + (%wholeNumRecs% % 2);
  931. #UNIQUENAME(q2Value1);
  932. LOCAL %q2Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q2Pos1%), value);
  933. #UNIQUENAME(q2Pos2);
  934. LOCAL %q2Pos2% := %q2Pos1% + ((%wholeNumRecs% + 1) % 2);
  935. #UNIQUENAME(q2Value2);
  936. LOCAL %q2Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q2Pos2%), value);
  937. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q2_value')) := AVE(%q2Value1%, %q2Value2%);
  938. // Find the lower quartile
  939. #UNIQUENAME(q1Pos1);
  940. LOCAL %q1Pos1% := (%halfNumRecs% DIV 2) + (%halfNumRecs% % 2);
  941. #UNIQUENAME(q1Value1);
  942. LOCAL %q1Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q1Pos1%), value);
  943. #UNIQUENAME(q1Pos2);
  944. LOCAL %q1Pos2% := %q1Pos1% + ((%halfNumRecs% + 1) % 2);
  945. #UNIQUENAME(q1Value2);
  946. LOCAL %q1Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q1Pos2%), value);
  947. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q1_value')) := IF(%halfNumRecs% > 0, AVE(%q1Value1%, %q1Value2%), 0);
  948. // Find the upper quartile
  949. #UNIQUENAME(q3Pos1);
  950. LOCAL %q3Pos1% := MAX(%q2Pos1%, %q2Pos2%) + (%halfNumRecs% DIV 2) + (%halfNumRecs% % 2);
  951. #UNIQUENAME(q3Value1);
  952. LOCAL %q3Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q3Pos1%), value);
  953. #UNIQUENAME(q3Pos2);
  954. LOCAL %q3Pos2% := %q3Pos1% - ((%halfNumRecs% + 1) % 2);
  955. #UNIQUENAME(q3Value2);
  956. LOCAL %q3Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q3Pos2%), value);
  957. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q3_value')) := IF(%halfNumRecs% > 0, AVE(%q3Value1%, %q3Value2%), 0);
  958. // Derive all unique data values and the number of times
  959. // each occurs in the data
  960. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')) := TABLE
  961. (
  962. %filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'%),
  963. {
  964. string_value,
  965. UNSIGNED4 rec_count := SUM(GROUP, value_count)
  966. },
  967. string_value,
  968. MERGE
  969. );
  970. // Find the mode of the (string) data; using a JOIN here
  971. // to avoid the 10MB limit error that sometimes occurs
  972. // when you use filters to find a single value; also note
  973. // the TOPN calls to reduce the search space, which also
  974. // effectively limit the final result to _maxModes records
  975. #UNIQUENAME(topRecords);
  976. %topRecords% := TOPN(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')), _maxModes, -rec_count);
  977. #UNIQUENAME(topRecord)
  978. %topRecord% := TOPN(%topRecords%, 1, -rec_count);
  979. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_mode_values')) := JOIN
  980. (
  981. %topRecords%,
  982. %topRecord%,
  983. LEFT.rec_count = RIGHT.rec_count,
  984. TRANSFORM
  985. (
  986. ModeRec,
  987. SELF.value := LEFT.string_value,
  988. SELF.rec_count := LEFT.rec_count
  989. ),
  990. SMART
  991. ) : ONWARNING(4531, IGNORE);
  992. // Get records with low cardinality
  993. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_lcb_recs')) := IF
  994. (
  995. COUNT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs'))) <= _lcbLimit,
  996. PROJECT
  997. (
  998. SORT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')), -rec_count),
  999. TRANSFORM
  1000. (
  1001. ModeRec,
  1002. SELF.value := LEFT.string_value,
  1003. SELF.rec_count := LEFT.rec_count
  1004. )
  1005. ),
  1006. DATASET([], ModeRec)
  1007. );
  1008. #END
  1009. #END
  1010. #END
  1011. #END
  1012. // Run correlations on all unique pairs of numeric fields in the data
  1013. #UNIQUENAME(BaseCorrelationLayout);
  1014. LOCAL %BaseCorrelationLayout% := RECORD
  1015. %Attribute_t% attribute_x;
  1016. %Attribute_t% attribute_y;
  1017. REAL corr;
  1018. END;
  1019. #UNIQUENAME(corrNamePosX);
  1020. #UNIQUENAME(corrNamePosY);
  1021. #UNIQUENAME(fieldX);
  1022. #UNIQUENAME(fieldY);
  1023. #SET(needsDelim, 0);
  1024. #UNIQUENAME(correlations0);
  1025. LOCAL %correlations0% := DATASET
  1026. (
  1027. [
  1028. #SET(corrNamePosX, 1)
  1029. #LOOP
  1030. #SET(fieldX, REGEXFIND('^([^,]+)', %'numericFields'%[%corrNamePosX%..], 1))
  1031. #IF(%'fieldX'% != '')
  1032. #SET(corrNamePosY, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1033. #LOOP
  1034. #SET(fieldY, REGEXFIND('^([^,]+)', %'numericFields'%[%corrNamePosY%..], 1))
  1035. #IF(%'fieldY'% != '')
  1036. #IF(%needsDelim% = 1) , #END
  1037. {
  1038. %'fieldX'%,
  1039. %'fieldY'%,
  1040. CORRELATION(_inFile, _inFile.%fieldX%, _inFile.%fieldY%)
  1041. }
  1042. #SET(needsDelim, 1)
  1043. #SET(corrNamePosY, %corrNamePosY% + LENGTH(%'fieldY'%) + 1)
  1044. #ELSE
  1045. #BREAK
  1046. #END
  1047. #END
  1048. #SET(corrNamePosX, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1049. #ELSE
  1050. #BREAK
  1051. #END
  1052. #END
  1053. ],
  1054. %BaseCorrelationLayout%
  1055. );
  1056. // Append a duplicate of the correlations to itself with the X and Y fields
  1057. // reversed so we can easily merge results on a per-attribute basis later
  1058. #UNIQUENAME(correlations);
  1059. LOCAL %correlations% := %correlations0% + PROJECT
  1060. (
  1061. %correlations0%,
  1062. TRANSFORM
  1063. (
  1064. RECORDOF(LEFT),
  1065. SELF.attribute_x := LEFT.attribute_y,
  1066. SELF.attribute_y := LEFT.attribute_x,
  1067. SELF := LEFT
  1068. )
  1069. );
  1070. // Create a small dataset that specifies the output order of the named
  1071. // attributes (which should be the same as the input order)
  1072. #UNIQUENAME(resultOrderDS);
  1073. LOCAL %resultOrderDS% := DATASET
  1074. (
  1075. [
  1076. #SET(needsDelim, 0)
  1077. #SET(corrNamePosX, 1)
  1078. #SET(fieldY, 1)
  1079. #LOOP
  1080. #SET(fieldX, REGEXFIND('^([^,]+)', %'explicitFields'%[%corrNamePosX%..], 1))
  1081. #IF(%'fieldX'% != '')
  1082. #IF(%needsDelim% = 1) , #END
  1083. {%fieldY%, %'fieldX'%}
  1084. #SET(needsDelim, 1)
  1085. #SET(corrNamePosX, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1086. #SET(fieldY, %fieldY% + 1)
  1087. #ELSE
  1088. #BREAK
  1089. #END
  1090. #END
  1091. ],
  1092. {
  1093. UNSIGNED2 nameOrder,
  1094. %Attribute_t% attrName
  1095. }
  1096. );
  1097. //--------------------------------------------------------------------------
  1098. // Collect individual stats for each attribute; these are grouped by the
  1099. // criteria used to group them
  1100. //--------------------------------------------------------------------------
  1101. // Count data patterns used per attribute; extract the most common and
  1102. // most rare, taking care to not allow the two to overlap; we will
  1103. // replace the '0' character left in from the pattern generation with
  1104. // a '9' character to make the numeric pattern complete
  1105. #UNIQUENAME(dataPatternStats0);
  1106. LOCAL %dataPatternStats0% := PROJECT
  1107. (
  1108. %filledDataInfoNumeric%,
  1109. TRANSFORM
  1110. (
  1111. RECORDOF(LEFT),
  1112. SELF.data_pattern := Std.Str.FindReplace(LEFT.data_pattern, '0', '9'),
  1113. SELF := LEFT
  1114. )
  1115. );
  1116. #UNIQUENAME(dataPatternStats);
  1117. LOCAL %dataPatternStats% := TABLE
  1118. (
  1119. %dataPatternStats0%,
  1120. {
  1121. attribute,
  1122. data_pattern,
  1123. STRING example := string_value[..%foundMaxPatternLen%],
  1124. UNSIGNED4 rec_count := SUM(GROUP, value_count)
  1125. },
  1126. attribute, data_pattern,
  1127. MERGE
  1128. ) : ONWARNING(2168, IGNORE);
  1129. #UNIQUENAME(groupedDataPatterns);
  1130. LOCAL %groupedDataPatterns% := GROUP(SORT(DISTRIBUTE(%dataPatternStats%, HASH32(attribute)), attribute, LOCAL), attribute, LOCAL);
  1131. #UNIQUENAME(topDataPatterns);
  1132. LOCAL %topDataPatterns% := UNGROUP(TOPN(%groupedDataPatterns%, (UNSIGNED)_maxPatterns, -rec_count, data_pattern));
  1133. #UNIQUENAME(rareDataPatterns0);
  1134. LOCAL %rareDataPatterns0% := UNGROUP(TOPN(%groupedDataPatterns%, (UNSIGNED)_maxPatterns, rec_count, data_pattern));
  1135. #UNIQUENAME(rareDataPatterns);
  1136. LOCAL %rareDataPatterns% := JOIN
  1137. (
  1138. %rareDataPatterns0%,
  1139. %topDataPatterns%,
  1140. LEFT.attribute = RIGHT.attribute AND LEFT.data_pattern = RIGHT.data_pattern,
  1141. TRANSFORM(LEFT),
  1142. LEFT ONLY
  1143. ) : ONWARNING(4531, IGNORE);
  1144. // Find min, max and average data lengths per attribute
  1145. #UNIQUENAME(dataLengthStats);
  1146. LOCAL %dataLengthStats% := TABLE
  1147. (
  1148. %filledDataInfoNumeric%,
  1149. {
  1150. attribute,
  1151. UNSIGNED4 min_length := MIN(GROUP, data_length),
  1152. UNSIGNED4 max_length := MAX(GROUP, data_length),
  1153. UNSIGNED4 ave_length := SUM(GROUP, data_length * value_count) / SUM(GROUP, value_count)
  1154. },
  1155. attribute,
  1156. MERGE
  1157. );
  1158. // Count attribute fill rates per attribute; will be turned into
  1159. // percentages later
  1160. #UNIQUENAME(dataFilledStats);
  1161. LOCAL %dataFilledStats% := TABLE
  1162. (
  1163. %dataInfo%,
  1164. {
  1165. attribute,
  1166. given_attribute_type,
  1167. UNSIGNED4 rec_count := SUM(GROUP, value_count),
  1168. UNSIGNED4 filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
  1169. },
  1170. attribute, given_attribute_type,
  1171. MERGE
  1172. );
  1173. // Compute the cardinality and pull in previously-computed explicit
  1174. // attribute values at the same time
  1175. #UNIQUENAME(cardinalityAndNumerics);
  1176. LOCAL %cardinalityAndNumerics% := DATASET
  1177. (
  1178. [
  1179. #SET(recLevel, 0)
  1180. #SET(fieldStack, '')
  1181. #SET(namePrefix, '')
  1182. #SET(needsDelim, 0)
  1183. #FOR(inFileFields)
  1184. #FOR(Field)
  1185. #IF(%{@isRecord}% = 1)
  1186. #SET(fieldStack, 'r' + %'fieldStack'%)
  1187. #APPEND(namePrefix, %'@name'% + '.')
  1188. #ELSEIF(%{@isDataset}% = 1)
  1189. #SET(fieldStack, 'd' + %'fieldStack'%)
  1190. #SET(recLevel, %recLevel% + 1)
  1191. #ELSEIF(%{@isEnd}% = 1)
  1192. #IF(%'fieldStack'%[1] = 'd')
  1193. #SET(recLevel, %recLevel% - 1)
  1194. #ELSE
  1195. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  1196. #END
  1197. #SET(fieldStack, %'fieldStack'%[2..])
  1198. #ELSEIF(%recLevel% = 0)
  1199. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  1200. #IF(%needsDelim% = 1) , #END
  1201. {
  1202. %'namePrefix'% + %'@name'%,
  1203. #IF(%_IsSetType%(%'@type'%))
  1204. FALSE,
  1205. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  1206. TRUE,
  1207. #ELSE
  1208. FALSE,
  1209. #END
  1210. #IF(%FeatureEnabledCardinality%())
  1211. COUNT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs'))),
  1212. #ELSE
  1213. 0,
  1214. #END
  1215. #IF(%FeatureEnabledMinMax%())
  1216. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_min')),
  1217. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_max')),
  1218. #ELSE
  1219. 0,
  1220. 0,
  1221. #END
  1222. #IF(%FeatureEnabledMean%())
  1223. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave')),
  1224. #ELSE
  1225. 0,
  1226. #END
  1227. #IF(%FeatureEnabledStdDev%())
  1228. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_std_dev')),
  1229. #ELSE
  1230. 0,
  1231. #END
  1232. #IF(%FeatureEnabledQuartiles%())
  1233. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q1_value')),
  1234. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q2_value')),
  1235. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q3_value')),
  1236. #ELSE
  1237. 0,
  1238. 0,
  1239. 0,
  1240. #END
  1241. #IF(%FeatureEnabledModes%())
  1242. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_mode_values'))(rec_count > 1), // Modes must have more than one instance
  1243. #ELSE
  1244. DATASET([], ModeRec),
  1245. #END
  1246. #IF(%FeatureEnabledLowCardinalityBreakdown%())
  1247. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_lcb_recs'))
  1248. #ELSE
  1249. DATASET([], ModeRec)
  1250. #END
  1251. }
  1252. #SET(needsDelim, 1)
  1253. #END
  1254. #END
  1255. #END
  1256. #END
  1257. ],
  1258. {
  1259. %Attribute_t% attribute,
  1260. BOOLEAN is_numeric,
  1261. UNSIGNED4 cardinality,
  1262. REAL numeric_min,
  1263. REAL numeric_max,
  1264. REAL numeric_mean,
  1265. REAL numeric_std_dev,
  1266. REAL numeric_lower_quartile,
  1267. REAL numeric_median,
  1268. REAL numeric_upper_quartile,
  1269. DATASET(ModeRec) modes;
  1270. DATASET(ModeRec) cardinality_breakdown;
  1271. }
  1272. );
  1273. //--------------------------------------------------------------------------
  1274. // Collect the individual results into a single output dataset
  1275. //--------------------------------------------------------------------------
  1276. #UNIQUENAME(final10);
  1277. LOCAL %final10% := PROJECT
  1278. (
  1279. %dataFilledStats%,
  1280. TRANSFORM
  1281. (
  1282. _resultLayout,
  1283. SELF.attribute := TRIM(LEFT.attribute, RIGHT),
  1284. SELF.given_attribute_type := TRIM(LEFT.given_attribute_type, RIGHT),
  1285. SELF.rec_count := LEFT.rec_count,
  1286. SELF.fill_rate := #IF(%FeatureEnabledFillRate%()) LEFT.filled_count / LEFT.rec_count * 100 #ELSE 0 #END,
  1287. SELF.fill_count := #IF(%FeatureEnabledFillRate%()) LEFT.filled_count #ELSE 0 #END,
  1288. SELF := []
  1289. )
  1290. );
  1291. #UNIQUENAME(final15);
  1292. LOCAL %final15% :=
  1293. #IF(%FeatureEnabledBestECLTypes%())
  1294. JOIN
  1295. (
  1296. %final10%,
  1297. %attributeBestTypeInfo%,
  1298. LEFT.attribute = RIGHT.attribute,
  1299. TRANSFORM
  1300. (
  1301. RECORDOF(LEFT),
  1302. SELF.best_attribute_type := IF(TRIM(RIGHT.best_attribute_type, RIGHT) != '', TRIM(RIGHT.best_attribute_type, RIGHT), LEFT.given_attribute_type),
  1303. SELF := LEFT
  1304. ),
  1305. LEFT OUTER
  1306. ) : ONWARNING(4531, IGNORE)
  1307. #ELSE
  1308. %final10%
  1309. #END;
  1310. #UNIQUENAME(final20);
  1311. LOCAL %final20% :=
  1312. #IF(%FeatureEnabledLengths%())
  1313. JOIN
  1314. (
  1315. %final15%,
  1316. %dataLengthStats%,
  1317. LEFT.attribute = RIGHT.attribute,
  1318. TRANSFORM
  1319. (
  1320. RECORDOF(LEFT),
  1321. SELF.attribute := LEFT.attribute,
  1322. SELF := RIGHT,
  1323. SELF := LEFT
  1324. ),
  1325. LEFT OUTER, KEEP(1), SMART
  1326. ) : ONWARNING(4531, IGNORE)
  1327. #ELSE
  1328. %final15%
  1329. #END;
  1330. #UNIQUENAME(final30);
  1331. LOCAL %final30% :=
  1332. #IF(%FeatureEnabledCardinality%() OR %FeatureEnabledLowCardinalityBreakdown%() OR %FeatureEnabledMinMax%() OR %FeatureEnabledMean%() OR %FeatureEnabledStdDev%() OR %FeatureEnabledQuartiles%() OR %FeatureEnabledModes%())
  1333. JOIN
  1334. (
  1335. %final20%,
  1336. %cardinalityAndNumerics%,
  1337. LEFT.attribute = RIGHT.attribute,
  1338. TRANSFORM
  1339. (
  1340. RECORDOF(LEFT),
  1341. SELF.attribute := LEFT.attribute,
  1342. SELF := RIGHT,
  1343. SELF := LEFT
  1344. ),
  1345. LEFT OUTER, KEEP(1), SMART
  1346. ) : ONWARNING(4531, IGNORE)
  1347. #ELSE
  1348. %final20%
  1349. #END;
  1350. #UNIQUENAME(final35);
  1351. LOCAL %final35% := JOIN
  1352. (
  1353. %final30%,
  1354. %attributeBestTypeInfo%,
  1355. LEFT.attribute = RIGHT.attribute,
  1356. TRANSFORM
  1357. (
  1358. RECORDOF(LEFT),
  1359. SELF.is_numeric := LEFT.is_numeric OR (REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', RIGHT.best_attribute_type) AND NOT REGEXFIND('set of ', RIGHT.best_attribute_type)),
  1360. SELF := LEFT
  1361. ),
  1362. LEFT OUTER, KEEP(1), SMART
  1363. ) : ONWARNING(4531, IGNORE);
  1364. #UNIQUENAME(final40);
  1365. LOCAL %final40% :=
  1366. #IF(%FeatureEnabledPatterns%())
  1367. DENORMALIZE
  1368. (
  1369. %final35%,
  1370. %topDataPatterns%,
  1371. LEFT.attribute = RIGHT.attribute,
  1372. GROUP,
  1373. TRANSFORM
  1374. (
  1375. RECORDOF(LEFT),
  1376. SELF.popular_patterns := SORT(PROJECT(ROWS(RIGHT), TRANSFORM(PatternCountRec, SELF := LEFT)), -rec_count, data_pattern),
  1377. SELF := LEFT
  1378. ),
  1379. LEFT OUTER, SMART
  1380. ) : ONWARNING(4531, IGNORE)
  1381. #ELSE
  1382. %final35%
  1383. #END;
  1384. #UNIQUENAME(final50);
  1385. LOCAL %final50% :=
  1386. #IF(%FeatureEnabledPatterns%())
  1387. DENORMALIZE
  1388. (
  1389. %final40%,
  1390. %rareDataPatterns%,
  1391. LEFT.attribute = RIGHT.attribute,
  1392. GROUP,
  1393. TRANSFORM
  1394. (
  1395. RECORDOF(LEFT),
  1396. SELF.rare_patterns := SORT(PROJECT(ROWS(RIGHT), TRANSFORM(PatternCountRec, SELF := LEFT)), rec_count, data_pattern),
  1397. SELF := LEFT
  1398. ),
  1399. LEFT OUTER, SMART
  1400. ) : ONWARNING(4531, IGNORE)
  1401. #ELSE
  1402. %final40%
  1403. #END;
  1404. #UNIQUENAME(final60);
  1405. LOCAL %final60% :=
  1406. #IF(%FeatureEnabledCorrelations%())
  1407. DENORMALIZE
  1408. (
  1409. %final50%,
  1410. %correlations%,
  1411. LEFT.attribute = RIGHT.attribute_x,
  1412. GROUP,
  1413. TRANSFORM
  1414. (
  1415. RECORDOF(LEFT),
  1416. SELF.correlations := SORT
  1417. (
  1418. PROJECT
  1419. (
  1420. ROWS(RIGHT),
  1421. TRANSFORM
  1422. (
  1423. CorrelationRec,
  1424. SELF.attribute := LEFT.attribute_y,
  1425. SELF.corr := LEFT.corr
  1426. )
  1427. ),
  1428. -corr
  1429. ),
  1430. SELF := LEFT
  1431. ),
  1432. LEFT OUTER, SMART
  1433. )
  1434. #ELSE
  1435. %final50%
  1436. #END;
  1437. // Append the attribute order to the results; we will sort on the order
  1438. // when creating the final output
  1439. #UNIQUENAME(final70);
  1440. LOCAL %final70% := JOIN
  1441. (
  1442. %final60%,
  1443. %resultOrderDS%,
  1444. TRIM(LEFT.attribute, LEFT, RIGHT) = RIGHT.attrName,
  1445. TRANSFORM
  1446. (
  1447. RECORDOF(LEFT),
  1448. SELF.sortValue := _sortPrefix + INTFORMAT(RIGHT.nameOrder, 5, 1),
  1449. SELF.attribute := _attrNamePrefix + LEFT.attribute,
  1450. SELF := LEFT
  1451. )
  1452. ) : ONWARNING(4531, IGNORE);
  1453. RETURN #IF(%fieldCount% > 0) %final70% #ELSE DATASET([], _resultLayout) #END;
  1454. ENDMACRO;
  1455. //==========================================================================
  1456. // Call _Inner_Profile() with the given input dataset top-level scalar attributes,
  1457. // then again for each child dataset that has been found; combine the
  1458. // results of all the calls
  1459. #UNIQUENAME(collectedResults);
  1460. LOCAL %collectedResults% :=
  1461. #IF(%'explicitScalarFields'% != '')
  1462. _Inner_Profile
  1463. (
  1464. GLOBAL(%distributedInFile%),
  1465. %'explicitScalarFields'%,
  1466. maxPatterns,
  1467. maxPatternLen,
  1468. %lowCardinalityThreshold%,
  1469. %MAX_MODES%,
  1470. OutputLayout,
  1471. '',
  1472. ''
  1473. )
  1474. #ELSE
  1475. DATASET([], OutputLayout)
  1476. #END
  1477. #UNIQUENAME(dsNameValue)
  1478. #SET(namePos, 1)
  1479. #LOOP
  1480. #SET(dsNameValue, REGEXFIND('^([^,]+)', %'childDSFields'%[%namePos%..], 1))
  1481. #IF(%'dsNameValue'% != '')
  1482. #SET(numValue, REGEXFIND('^(\\d+):', %'dsNameValue'%, 1))
  1483. #SET(nameValue, REGEXFIND(':([^:]+)$', %'dsNameValue'%, 1))
  1484. // The child dataset should have been extracted into its own
  1485. // local attribute; reference it during our call to the inner
  1486. // profile function macro
  1487. #SET(temp, #MANGLE(%'nameValue'%))
  1488. + _Inner_Profile
  1489. (
  1490. GLOBAL(%temp%),
  1491. '',
  1492. maxPatterns,
  1493. maxPatternLen,
  1494. %lowCardinalityThreshold%,
  1495. %MAX_MODES%,
  1496. OutputLayout,
  1497. %'nameValue'% + '.',
  1498. INTFORMAT(%numValue%, 5, 1) + '.'
  1499. )
  1500. #SET(namePos, %namePos% + LENGTH(%'dsNameValue'%) + 1)
  1501. #ELSE
  1502. #BREAK
  1503. #END
  1504. #END;
  1505. // Put the combined _Inner_Profile() results in the right order and layout
  1506. #UNIQUENAME(finalData);
  1507. LOCAL %finalData% := PROJECT(SORT(%collectedResults%, sortValue), %FinalOutputLayout%);
  1508. RETURN %finalData%;
  1509. ENDMACRO;