Profile.ecl 82 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560
  1. /***
  2. * Function macro for profiling all or part of a dataset. The output is a
  3. * dataset containing the following information for each profiled attribute:
  4. *
  5. * attribute The name of the attribute
  6. * given_attribute_type The ECL type of the attribute as it was defined
  7. * in the input dataset
  8. * best_attribute_type An ECL data type that both allows all values
  9. * in the input dataset and consumes the least
  10. * amount of memory
  11. * rec_count The number of records analyzed in the dataset;
  12. * this may be fewer than the total number of
  13. * records, if the optional sampleSize argument
  14. * was provided with a value less than 100
  15. * fill_count The number of rec_count records containing
  16. * non-nil values; a 'nil value' is an empty
  17. * string, a numeric zero, or an empty SET; note
  18. * that BOOLEAN attributes are always counted as
  19. * filled, regardless of their value; also,
  20. * fixed-length DATA attributes (e.g. DATA10) are
  21. * also counted as filled, given their typical
  22. * function of holding data blobs
  23. * fill_rate The percentage of rec_count records containing
  24. * non-nil values; this is basically
  25. * fill_count / rec_count * 100
  26. * cardinality The number of unique, non-nil values within
  27. * the attribute
  28. * cardinality_breakdown For those attributes with a low number of
  29. * unique, non-nil values, show each value and the
  30. * number of records containing that value; the
  31. * lcbLimit parameter governs what "low number"
  32. * means
  33. * modes The most common values in the attribute, after
  34. * coercing all values to STRING, along with the
  35. * number of records in which the values were
  36. * found; if no value is repeated more than once
  37. * then no mode will be shown; up to five (5)
  38. * modes will be shown; note that string values
  39. * longer than the maxPatternLen argument will
  40. * be truncated
  41. * min_length For SET datatypes, the fewest number of elements
  42. * found in the set; for other data types, the
  43. * shortest length of a value when expressed
  44. * as a string; null values are ignored
  45. * max_length For SET datatypes, the largest number of elements
  46. * found in the set; for other data types, the
  47. * longest length of a value when expressed
  48. * as a string; null values are ignored
  49. * ave_length For SET datatypes, the average number of elements
  50. * found in the set; for other data types, the
  51. * average length of a value when expressed
  52. * as a string; null values are ignored
  53. * popular_patterns The most common patterns of values; see below
  54. * rare_patterns The least common patterns of values; see below
  55. * is_numeric Boolean indicating if the original attribute
  56. * was a numeric scalar and therefore whether or
  57. * not the numeric_xxxx output fields will be
  58. * populated with actual values; if this value
  59. * is FALSE then all numeric_xxxx output values
  60. * should be ignored
  61. * numeric_min The smallest non-nil value found within the
  62. * attribute as a DECIMAL; the attribute must be
  63. * a numeric ECL datatype; non-numeric attributes
  64. * will return zero
  65. * numeric_max The largest non-nil value found within the
  66. * attribute as a DECIMAL; the attribute must be
  67. * a numeric ECL datatype; non-numeric attributes
  68. * will return zero
  69. * numeric_mean The mean (average) non-nil value found within
  70. * the attribute as a DECIMAL; the attribute must
  71. * be a numeric ECL datatype; non-numeric
  72. * attributes will return zero
  73. * numeric_std_dev The standard deviation of the non-nil values
  74. * in the attribute as a DECIMAL; the attribute
  75. * must be a numeric ECL datatype; non-numeric
  76. * attributes will return zero
  77. * numeric_lower_quartile The value separating the first (bottom) and
  78. * second quarters of non-nil values within
  79. * the attribute as a DECIMAL; the attribute must
  80. * be a numeric ECL datatype; non-numeric
  81. * attributes will return zero
  82. * numeric_median The median non-nil value within the attribute
  83. * as a DECIMAL; the attribute must be a numeric
  84. * ECL datatype; non-numeric attributes will return
  85. * zero
  86. * numeric_upper_quartile The value separating the third and fourth
  87. * (top) quarters of non-nil values within
  88. * the attribute as a DECIMAL; the attribute must
  89. * be a numeric ECL datatype; non-numeric
  90. * attributes will return zero
  91. * numeric_correlations A child dataset containing correlation values
  92. * comparing the current numeric attribute with all
  93. * other numeric attributes, listed in descending
  94. * correlation value order; the attribute must be
  95. * a numeric ECL datatype; non-numeric attributes
  96. * will return an empty child dataset; note that
  97. * this can be a time-consuming operation,
  98. * depending on the number of numeric attributes
  99. * in your dataset and the number of rows (if you
  100. * have N numeric attributes, then
  101. * N * (N - 1) / 2 calculations are performed,
  102. * each scanning all data rows)
  103. *
  104. * Most profile outputs can be disabled. See the 'features' argument, below.
  105. *
  106. * Data patterns can give you an idea of what your data looks like when it is
  107. * expressed as a (human-readable) string. The function converts each
  108. * character of the string into a fixed character palette to producing a "data
  109. * pattern" and then counts the number of unique patterns for that attribute.
  110. * The most- and least-popular patterns from the data will be shown in the
  111. * output, along with the number of times that pattern appears and an example
  112. * (randomly chosen from the actual data). The character palette used is:
  113. *
  114. * A Any uppercase letter
  115. * a Any lowercase letter
  116. * 9 Any numeric digit
  117. * B A boolean value (true or false)
  118. *
  119. * All other characters are left as-is in the pattern.
  120. *
  121. * Function parameters:
  122. *
  123. * @param inFile The dataset to process; this could be a child
  124. * dataset (e.g. inFile.childDS); REQUIRED
  125. * @param fieldListStr A string containing a comma-delimited list of
  126. * attribute names to process; note that attributes
  127. * listed here must be scalar datatypes (not child
  128. * records or child datasets); use an empty string to
  129. * process all attributes in inFile; OPTIONAL,
  130. * defaults to an empty string
  131. * @param maxPatterns The maximum number of patterns (both popular and
  132. * rare) to return for each attribute; OPTIONAL,
  133. * defaults to 100
  134. * @param maxPatternLen The maximum length of a pattern; longer patterns
  135. * are truncated in the output; this value is also
  136. * used to set the maximum length of the data to
  137. * consider when finding cardinality and mode values;
  138. * must be 33 or larger; OPTIONAL, defaults to 100
  139. * @param features A comma-delimited string listing the profiling
  140. * elements to be included in the output; OPTIONAL,
  141. * defaults to a comma-delimited string containing all
  142. * of the available keywords:
  143. * KEYWORD AFFECTED OUTPUT
  144. * fill_rate fill_rate
  145. * fill_count
  146. * cardinality cardinality
  147. * cardinality_breakdown cardinality_breakdown
  148. * best_ecl_types best_attribute_type
  149. * modes modes
  150. * lengths min_length
  151. * max_length
  152. * ave_length
  153. * patterns popular_patterns
  154. * rare_patterns
  155. * min_max numeric_min
  156. * numeric_max
  157. * mean numeric_mean
  158. * std_dev numeric_std_dev
  159. * quartiles numeric_lower_quartile
  160. * numeric_median
  161. * numeric_upper_quartile
  162. * correlations numeric_correlations
  163. * To omit the output associated with a single keyword,
  164. * set this argument to a comma-delimited string
  165. * containing all other keywords; note that the
  166. * is_numeric output will appear only if min_max,
  167. * mean, std_dev, quartiles, or correlations features
  168. * are active; also note that enabling the
  169. * cardinality_breakdown feature will also enable
  170. * the cardinality feature, even if it is not
  171. * explicitly enabled
  172. * @param sampleSize A positive integer representing a percentage of
  173. * inFile to examine, which is useful when analyzing a
  174. * very large dataset and only an estimated data
  175. * profile is sufficient; valid range for this
  176. * argument is 1-100; values outside of this range
  177. * will be clamped; OPTIONAL, defaults to 100 (which
  178. * indicates that the entire dataset will be analyzed)
  179. * @param lcbLimit A positive integer (<= 500) indicating the maximum
  180. * cardinality allowed for an attribute in order to
  181. * emit a breakdown of the attribute's values; this
  182. * parameter will be ignored if cardinality_breakdown
  183. * is not included in the features argument; OPTIONAL,
  184. * defaults to 64
  185. */
  186. EXPORT Profile(inFile,
  187. fieldListStr = '\'\'',
  188. maxPatterns = 100,
  189. maxPatternLen = 100,
  190. features = '\'fill_rate,best_ecl_types,cardinality,cardinality_breakdown,modes,lengths,patterns,min_max,mean,std_dev,quartiles,correlations\'',
  191. sampleSize = 100,
  192. lcbLimit = 64) := FUNCTIONMACRO
  193. LOADXML('<xml/>');
  194. #UNIQUENAME(temp); // Ubiquitous "contains random things"
  195. #UNIQUENAME(scalarFields); // Contains a delimited list of scalar attributes (full names) along with their indexed positions
  196. #UNIQUENAME(explicitScalarFields); // Contains a delimited list of scalar attributes (full names) without indexed positions
  197. #UNIQUENAME(childDSFields); // Contains a delimited list of child dataset attributes (full names) along with their indexed positions
  198. #UNIQUENAME(fieldCount); // Contains the number of fields we've seen while processing record layouts
  199. #UNIQUENAME(recLevel); // Will be used to determine at which level we are processing
  200. #UNIQUENAME(fieldStack); // String-based stack telling us whether we're within an embedded dataset or record
  201. #UNIQUENAME(namePrefix); // When processing child records and datasets, contains the leading portion of the attribute's full name
  202. #UNIQUENAME(fullName); // The full name of an attribute
  203. #UNIQUENAME(needsDelim); // Boolean indicating whether we need to insert a delimiter somewhere
  204. #UNIQUENAME(namePos); // Contains character offset information, for parsing delimited strings
  205. #UNIQUENAME(numValue); // Extracted numeric value from a string
  206. #UNIQUENAME(nameValue); // Extracted string value from a string
  207. IMPORT Std;
  208. //--------------------------------------------------------------------------
  209. // Remove all spaces from features list so we can parse it more easily
  210. #UNIQUENAME(trimmedFeatures);
  211. LOCAL %trimmedFeatures% := TRIM(features, ALL);
  212. // Remove all spaces from field list so we can parse it more easily
  213. #UNIQUENAME(trimmedFieldList);
  214. LOCAL %trimmedFieldList% := TRIM(fieldListStr, ALL);
  215. // Clamp lcbLimit to 0..500
  216. #UNIQUENAME(lowCardinalityThreshold);
  217. LOCAL %lowCardinalityThreshold% := MIN(MAX(lcbLimit, 0), 500);
  218. // The maximum number of mode values to return
  219. #UNIQUENAME(MAX_MODES);
  220. LOCAL %MAX_MODES% := 5;
  221. // Typedefs
  222. #UNIQUENAME(Attribute_t);
  223. LOCAL %Attribute_t% := STRING;
  224. #UNIQUENAME(AttributeType_t);
  225. LOCAL %AttributeType_t% := STRING36;
  226. #UNIQUENAME(NumericStat_t);
  227. LOCAL %NumericStat_t% := DECIMAL32_4;
  228. // Tests for enabled features
  229. #UNIQUENAME(FeatureEnabledFillRate);
  230. LOCAL %FeatureEnabledFillRate%() := REGEXFIND('\\bfill_rate\\b', %trimmedFeatures%, NOCASE);
  231. #UNIQUENAME(FeatureEnabledBestECLTypes);
  232. LOCAL %FeatureEnabledBestECLTypes%() := REGEXFIND('\\bbest_ecl_types\\b', %trimmedFeatures%, NOCASE);
  233. #UNIQUENAME(FeatureEnabledLowCardinalityBreakdown);
  234. LOCAL %FeatureEnabledLowCardinalityBreakdown%() := %lowCardinalityThreshold% > 0 AND REGEXFIND('\\bcardinality_breakdown\\b', %trimmedFeatures%, NOCASE);
  235. #UNIQUENAME(FeatureEnabledCardinality);
  236. LOCAL %FeatureEnabledCardinality%() := %FeatureEnabledLowCardinalityBreakdown%() OR REGEXFIND('\\bcardinality\\b', %trimmedFeatures%, NOCASE);
  237. #UNIQUENAME(FeatureEnabledModes);
  238. LOCAL %FeatureEnabledModes%() := REGEXFIND('\\bmodes\\b', %trimmedFeatures%, NOCASE);
  239. #UNIQUENAME(FeatureEnabledLengths);
  240. LOCAL %FeatureEnabledLengths%() := REGEXFIND('\\blengths\\b', %trimmedFeatures%, NOCASE);
  241. #UNIQUENAME(FeatureEnabledPatterns);
  242. LOCAL %FeatureEnabledPatterns%() := (UNSIGNED)maxPatterns > 0 AND REGEXFIND('\\bpatterns\\b', %trimmedFeatures%, NOCASE);
  243. #UNIQUENAME(FeatureEnabledMinMax);
  244. LOCAL %FeatureEnabledMinMax%() := REGEXFIND('\\bmin_max\\b', %trimmedFeatures%, NOCASE);
  245. #UNIQUENAME(FeatureEnabledMean);
  246. LOCAL %FeatureEnabledMean%() := REGEXFIND('\\bmean\\b', %trimmedFeatures%, NOCASE);
  247. #UNIQUENAME(FeatureEnabledStdDev);
  248. LOCAL %FeatureEnabledStdDev%() := REGEXFIND('\\bstd_dev\\b', %trimmedFeatures%, NOCASE);
  249. #UNIQUENAME(FeatureEnabledQuartiles);
  250. LOCAL %FeatureEnabledQuartiles%() := REGEXFIND('\\bquartiles\\b', %trimmedFeatures%, NOCASE);
  251. #UNIQUENAME(FeatureEnabledCorrelations);
  252. LOCAL %FeatureEnabledCorrelations%() := REGEXFIND('\\bcorrelations\\b', %trimmedFeatures%, NOCASE);
  253. //--------------------------------------------------------------------------
  254. // Ungroup the given dataset, in case it was grouped
  255. #UNIQUENAME(ungroupedInFile);
  256. LOCAL %ungroupedInFile% := UNGROUP(inFile);
  257. // Clamp the sample size to something reasonable
  258. #UNIQUENAME(clampedSampleSize);
  259. LOCAL %clampedSampleSize% := MAX(1, MIN(100, (INTEGER)sampleSize));
  260. // Create a sample dataset if needed
  261. #UNIQUENAME(sampledData);
  262. LOCAL %sampledData% := IF
  263. (
  264. %clampedSampleSize% < 100,
  265. ENTH(%ungroupedInFile%, %clampedSampleSize%, 100, 1, LOCAL),
  266. %ungroupedInFile%
  267. );
  268. // Slim the dataset if the caller provided an explicit set of attributes;
  269. // note that the TABLE function will fail if %trimmedFieldList% cites an
  270. // attribute that is a child dataset (this is an ECL limitation)
  271. #UNIQUENAME(workingInFile);
  272. LOCAL %workingInFile% :=
  273. #IF(%trimmedFieldList% = '')
  274. %sampledData%
  275. #ELSE
  276. TABLE(%sampledData%, {#EXPAND(%trimmedFieldList%)})
  277. #END;
  278. // Distribute the inbound dataset across all our nodes for faster processing
  279. #UNIQUENAME(distributedInFile);
  280. LOCAL %distributedInFile% := DISTRIBUTE(%workingInFile%, SKEW(0.05));
  281. #EXPORTXML(inFileFields, RECORDOF(%distributedInFile%));
  282. // Walk the slimmed dataset, pulling out top-level scalars and noting
  283. // child datasets
  284. #SET(scalarFields, '');
  285. #SET(childDSFields, '');
  286. #SET(fieldCount, 0);
  287. #SET(recLevel, 0);
  288. #SET(fieldStack, '');
  289. #SET(namePrefix, '');
  290. #SET(fullName, '');
  291. #FOR(inFileFields)
  292. #FOR(Field)
  293. #SET(fieldCount, %fieldCount% + 1)
  294. #IF(%{@isEnd}% != 1)
  295. // Adjust full name
  296. #SET(fullName, %'namePrefix'% + %'@name'%)
  297. #END
  298. #IF(%{@isRecord}% = 1)
  299. // Push record onto stack so we know what we're popping when we see @isEnd
  300. #SET(fieldStack, 'r' + %'fieldStack'%)
  301. #APPEND(namePrefix, %'@name'% + '.')
  302. #ELSEIF(%{@isDataset}% = 1)
  303. // Push dataset onto stack so we know what we're popping when we see @isEnd
  304. #SET(fieldStack, 'd' + %'fieldStack'%)
  305. #APPEND(namePrefix, %'@name'% + '.')
  306. #SET(recLevel, %recLevel% + 1)
  307. // Note the field index and field name so we can process it separately
  308. #IF(%'childDSFields'% != '')
  309. #APPEND(childDSFields, ',')
  310. #END
  311. #APPEND(childDSFields, %'fieldCount'% + ':' + %'fullName'%)
  312. // Extract the child dataset into its own attribute so we can more easily
  313. // process it later
  314. #SET(temp, #MANGLE(%'fullName'%));
  315. LOCAL %temp% := NORMALIZE
  316. (
  317. %distributedInFile%,
  318. LEFT.%fullName%,
  319. TRANSFORM
  320. (
  321. RECORDOF(%distributedInFile%.%fullName%),
  322. SELF := RIGHT
  323. )
  324. );
  325. #ELSEIF(%{@isEnd}% = 1)
  326. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  327. #IF(%'fieldStack'%[1] = 'd')
  328. #SET(recLevel, %recLevel% - 1)
  329. #END
  330. #SET(fieldStack, %'fieldStack'%[2..])
  331. #ELSEIF(%recLevel% = 0)
  332. // Note the field index and full name of the attribute so we can process it
  333. #IF(%'scalarFields'% != '')
  334. #APPEND(scalarFields, ',')
  335. #END
  336. #APPEND(scalarFields, %'fieldCount'% + ':' + %'fullName'%)
  337. #END
  338. #END
  339. #END
  340. // Collect the gathered full attribute names so we can walk them later
  341. #SET(explicitScalarFields, REGEXREPLACE('\\d+:', %'scalarFields'%, ''));
  342. // Define the record layout that will be used by the inner _Inner_Profile() call
  343. LOCAL ModeRec := RECORD
  344. STRING value;
  345. UNSIGNED4 rec_count;
  346. END;
  347. LOCAL PatternCountRec := RECORD
  348. STRING data_pattern;
  349. UNSIGNED4 rec_count;
  350. STRING example;
  351. END;
  352. LOCAL CorrelationRec := RECORD
  353. STRING attribute;
  354. DECIMAL7_6 corr;
  355. END;
  356. LOCAL OutputLayout := RECORD
  357. STRING sortValue;
  358. STRING attribute;
  359. UNSIGNED4 rec_count;
  360. STRING given_attribute_type;
  361. DECIMAL9_6 fill_rate;
  362. UNSIGNED4 fill_count;
  363. UNSIGNED4 cardinality;
  364. DATASET(ModeRec) cardinality_breakdown {MAXCOUNT(%lowCardinalityThreshold%)};
  365. STRING best_attribute_type;
  366. DATASET(ModeRec) modes {MAXCOUNT(%MAX_MODES%)};
  367. UNSIGNED4 min_length;
  368. UNSIGNED4 max_length;
  369. UNSIGNED4 ave_length;
  370. DATASET(PatternCountRec) popular_patterns {MAXCOUNT((UNSIGNED)maxPatterns)};
  371. DATASET(PatternCountRec) rare_patterns {MAXCOUNT((UNSIGNED)maxPatterns)};
  372. BOOLEAN is_numeric;
  373. %NumericStat_t% numeric_min;
  374. %NumericStat_t% numeric_max;
  375. %NumericStat_t% numeric_mean;
  376. %NumericStat_t% numeric_std_dev;
  377. %NumericStat_t% numeric_lower_quartile;
  378. %NumericStat_t% numeric_median;
  379. %NumericStat_t% numeric_upper_quartile;
  380. DATASET(CorrelationRec) numeric_correlations {MAXCOUNT(%fieldCount%)};
  381. END;
  382. // Define the record layout that will be returned to the caller; note
  383. // that the structure is variable, depending on the features passed
  384. // to Profile()
  385. #UNIQUENAME(FinalOutputLayout);
  386. LOCAL %FinalOutputLayout% := RECORD
  387. STRING attribute;
  388. STRING given_attribute_type;
  389. #IF(%FeatureEnabledBestECLTypes%())
  390. STRING best_attribute_type;
  391. #END
  392. UNSIGNED4 rec_count;
  393. #IF(%FeatureEnabledFillRate%())
  394. UNSIGNED4 fill_count;
  395. DECIMAL9_6 fill_rate;
  396. #END
  397. #IF(%FeatureEnabledCardinality%())
  398. UNSIGNED4 cardinality;
  399. #END
  400. #IF(%FeatureEnabledLowCardinalityBreakdown%())
  401. DATASET(ModeRec) cardinality_breakdown;
  402. #END
  403. #IF(%FeatureEnabledModes%())
  404. DATASET(ModeRec) modes;
  405. #END
  406. #IF(%FeatureEnabledLengths%())
  407. UNSIGNED4 min_length;
  408. UNSIGNED4 max_length;
  409. UNSIGNED4 ave_length;
  410. #END
  411. #IF(%FeatureEnabledPatterns%())
  412. DATASET(PatternCountRec) popular_patterns;
  413. DATASET(PatternCountRec) rare_patterns;
  414. #END
  415. #IF(%FeatureEnabledMinMax%() OR %FeatureEnabledMean%() OR %FeatureEnabledStdDev%() OR %FeatureEnabledQuartiles%() OR %FeatureEnabledCorrelations%())
  416. BOOLEAN is_numeric;
  417. #END
  418. #IF(%FeatureEnabledMinMax%())
  419. %NumericStat_t% numeric_min;
  420. %NumericStat_t% numeric_max;
  421. #END
  422. #IF(%FeatureEnabledMean%())
  423. %NumericStat_t% numeric_mean;
  424. #END
  425. #IF(%FeatureEnabledStdDev%())
  426. %NumericStat_t% numeric_std_dev;
  427. #END
  428. #IF(%FeatureEnabledQuartiles%())
  429. %NumericStat_t% numeric_lower_quartile;
  430. %NumericStat_t% numeric_median;
  431. %NumericStat_t% numeric_upper_quartile;
  432. #END
  433. #IF(%FeatureEnabledCorrelations%())
  434. DATASET(CorrelationRec) numeric_correlations;
  435. #END
  436. END;
  437. //==========================================================================
  438. // This is the meat of the function macro that actually does the profiling;
  439. // it is called with various datasets and (possibly) explicit attributes
  440. // to process and the results will eventually be combined to form the
  441. // final result; the parameters largely match the Profile() call, with the
  442. // addition of a few parameters that help place the results into the
  443. // correct format; note that the name of this function macro is not wrapped
  444. // in a UNIQUENAME -- that is due to an apparent limitation in the ECL
  445. // compiler
  446. LOCAL _Inner_Profile(_inFile,
  447. _fieldListStr,
  448. _maxPatterns,
  449. _maxPatternLen,
  450. _lcbLimit,
  451. _maxModes,
  452. _resultLayout,
  453. _attrNamePrefix,
  454. _sortPrefix) := FUNCTIONMACRO
  455. #EXPORTXML(inFileFields, RECORDOF(_inFile));
  456. #UNIQUENAME(foundMaxPatternLen); // Will become the length of the longest pattern we will be processing
  457. #SET(foundMaxPatternLen, 33); // Preset to minimum length for an attribute pattern
  458. #UNIQUENAME(explicitFields); // Attributes from _fieldListStr that are found in the top level of the dataset
  459. #UNIQUENAME(numericFields); // Numeric attributes from _fieldListStr that are found in the top level of the dataset
  460. // Validate that attribute is okay for us to process (there is no explicit
  461. // attribute list or the name is in the list)
  462. #UNIQUENAME(_CanProcessAttribute);
  463. LOCAL %_CanProcessAttribute%(STRING attrName) := (_fieldListStr = '' OR REGEXFIND('(^|,)' + attrName + '(,|$)', _fieldListStr, NOCASE));
  464. // Test an attribute type to see if is a SET OF <something>
  465. #UNIQUENAME(_IsSetType);
  466. LOCAL %_IsSetType%(STRING attrType) := (attrType[..7] = 'set of ');
  467. // Helper function to convert a full field name into something we
  468. // can reference as an ECL attribute
  469. #UNIQUENAME(_MakeAttr);
  470. LOCAL %_MakeAttr%(STRING attr) := REGEXREPLACE('\\.', attr, '_');
  471. // Pattern mapping a STRING datatype
  472. #UNIQUENAME(_MapAllStr);
  473. LOCAL STRING %_MapAllStr%(STRING s) := EMBED(C++)
  474. __lenResult = lenS;
  475. __result = static_cast<char*>(rtlMalloc(__lenResult));
  476. for (uint32_t x = 0; x < lenS; x++)
  477. {
  478. unsigned char ch = s[x];
  479. if (ch >= 'A' && ch <= 'Z')
  480. __result[x] = 'A';
  481. else if (ch >= 'a' && ch <= 'z')
  482. __result[x] = 'a';
  483. else if (ch >= '1' && ch <= '9') // Leave '0' as-is and replace with '9' later
  484. __result[x] = '9';
  485. else
  486. __result[x] = ch;
  487. }
  488. ENDEMBED;
  489. // Pattern mapping a UNICODE datatype; using regex due to the complexity
  490. // of the character set
  491. #UNIQUENAME(_MapUpperCharUni);
  492. LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'[[:upper:]]', s, u'A');
  493. #UNIQUENAME(_MapLowerCharUni);
  494. LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[:lower:]]', s, u'a');
  495. #UNIQUENAME(_MapDigitUni);
  496. LOCAL %_MapDigitUni%(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later
  497. #UNIQUENAME(_MapAllUni);
  498. LOCAL %_MapAllUni%(UNICODE s) := (STRING)%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
  499. // Trimming strings
  500. #UNIQUENAME(_TrimmedStr);
  501. LOCAL %_TrimmedStr%(STRING s) := TRIM(s, LEFT, RIGHT);
  502. #UNIQUENAME(_TrimmedUni);
  503. LOCAL %_TrimmedUni%(UNICODE s) := TRIM(s, LEFT, RIGHT);
  504. // Collect a list of the top-level attributes that we can process,
  505. // determine the actual maximum length of a data pattern (if we can
  506. // reduce that length then we can save on memory allocation), and
  507. // collect the numeric fields for correlation
  508. #SET(needsDelim, 0);
  509. #SET(recLevel, 0);
  510. #SET(fieldStack, '');
  511. #SET(namePrefix, '');
  512. #SET(explicitFields, '');
  513. #SET(numericFields, '');
  514. #FOR(inFileFields)
  515. #FOR(Field)
  516. #IF(%{@isRecord}% = 1)
  517. #SET(fieldStack, 'r' + %'fieldStack'%)
  518. #APPEND(namePrefix, %'@name'% + '.')
  519. #ELSEIF(%{@isDataset}% = 1)
  520. #SET(fieldStack, 'd' + %'fieldStack'%)
  521. #SET(recLevel, %recLevel% + 1)
  522. #ELSEIF(%{@isEnd}% = 1)
  523. #IF(%'fieldStack'%[1] = 'd')
  524. #SET(recLevel, %recLevel% - 1)
  525. #ELSE
  526. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  527. #END
  528. #SET(fieldStack, %'fieldStack'%[2..])
  529. #ELSEIF(%recLevel% = 0)
  530. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  531. #IF(%needsDelim% = 1)
  532. #APPEND(explicitFields, ',')
  533. #END
  534. #APPEND(explicitFields, %'namePrefix'% + %'@name'%)
  535. #SET(needsDelim, 1)
  536. #IF(NOT %_IsSetType%(%'@type'%))
  537. #IF(REGEXFIND('(string)|(data)|(utf)', %'@type'%))
  538. #IF(%@size% < 0)
  539. #SET(foundMaxPatternLen, MAX(_maxPatternLen, %foundMaxPatternLen%))
  540. #ELSE
  541. #SET(foundMaxPatternLen, MIN(MAX(%@size%, %foundMaxPatternLen%), _maxPatternLen))
  542. #END
  543. #ELSEIF(REGEXFIND('unicode', %'@type'%))
  544. // UNICODE is UCS-2 so the size reflects two bytes per character
  545. #IF(%@size% < 0)
  546. #SET(foundMaxPatternLen, MAX(_maxPatternLen, %foundMaxPatternLen%))
  547. #ELSE
  548. #SET(foundMaxPatternLen, MIN(MAX(%@size% DIV 2 + 1, %foundMaxPatternLen%), _maxPatternLen))
  549. #END
  550. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  551. #IF(%'numericFields'% != '')
  552. #APPEND(numericFields, ',')
  553. #END
  554. #APPEND(numericFields, %'namePrefix'% + %'@name'%)
  555. #END
  556. #END
  557. #END
  558. #END
  559. #END
  560. #END
  561. // Typedefs
  562. #UNIQUENAME(DataPattern_t);
  563. LOCAL %DataPattern_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
  564. #UNIQUENAME(StringValue_t);
  565. LOCAL %StringValue_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
  566. // Create a dataset containing pattern information, string length, and
  567. // booleans indicating filled and numeric datatypes for each processed
  568. // attribute; note that this is created by appending a series of PROJECT
  569. // results; to protect against skew problems when dealing with attributes
  570. // with low cardinality, and to attempt to reduce our temporary storage
  571. // footprint, create a reduced dataset that contains unique values for
  572. // our attributes and the number of times the values appear, as well as
  573. // some of the other interesting bits we can collect at the same time; note
  574. // that we try to explicitly target the original attribute's data type and
  575. // perform the minimal amount of work necessary on the value to transform
  576. // it to our common structure
  577. #UNIQUENAME(DataInfoRec);
  578. LOCAL %DataInfoRec% := RECORD
  579. %Attribute_t% attribute;
  580. %AttributeType_t% given_attribute_type;
  581. %StringValue_t% string_value;
  582. UNSIGNED4 value_count;
  583. %DataPattern_t% data_pattern;
  584. UNSIGNED4 data_length;
  585. BOOLEAN is_filled;
  586. BOOLEAN is_number;
  587. END;
  588. #UNIQUENAME(dataInfo);
  589. LOCAL %dataInfo% :=
  590. #SET(recLevel, 0)
  591. #SET(fieldStack, '')
  592. #SET(namePrefix, '')
  593. #SET(needsDelim, 0)
  594. #SET(fieldCount, 0)
  595. #FOR(inFileFields)
  596. #FOR(Field)
  597. #IF(%{@isRecord}% = 1)
  598. #SET(fieldStack, 'r' + %'fieldStack'%)
  599. #APPEND(namePrefix, %'@name'% + '.')
  600. #ELSEIF(%{@isDataset}% = 1)
  601. #SET(fieldStack, 'd' + %'fieldStack'%)
  602. #SET(recLevel, %recLevel% + 1)
  603. #ELSEIF(%{@isEnd}% = 1)
  604. #IF(%'fieldStack'%[1] = 'd')
  605. #SET(recLevel, %recLevel% - 1)
  606. #ELSE
  607. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  608. #END
  609. #SET(fieldStack, %'fieldStack'%[2..])
  610. #ELSEIF(%recLevel% = 0)
  611. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  612. #SET(fieldCount, %fieldCount% + 1)
  613. #IF(%needsDelim% = 1) + #END
  614. IF(EXISTS(_inFile),
  615. PROJECT
  616. (
  617. TABLE
  618. (
  619. _inFile,
  620. {
  621. %Attribute_t% attribute := %'namePrefix'% + %'@name'%,
  622. %AttributeType_t% given_attribute_type := %'@ecltype'%,
  623. %StringValue_t% string_value :=
  624. #IF(%_IsSetType%(%'@type'%))
  625. (%StringValue_t%)Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', ')
  626. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', %'@type'%))
  627. (%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)
  628. #ELSEIF(REGEXFIND('string', %'@type'%))
  629. %_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  630. #ELSE
  631. %_TrimmedStr%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  632. #END,
  633. UNSIGNED4 value_count := COUNT(GROUP),
  634. %DataPattern_t% data_pattern :=
  635. #IF(%_IsSetType%(%'@type'%))
  636. %_MapAllStr%(%_TrimmedStr%(Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', '))[..%foundMaxPatternLen%])
  637. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  638. %_MapAllStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  639. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  640. #IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
  641. %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  642. #ELSE
  643. %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  644. #END
  645. #ELSEIF(REGEXFIND('string', %'@type'%))
  646. #IF(%@size% < 0 OR %@size% > %foundMaxPatternLen%)
  647. %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  648. #ELSE
  649. %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  650. #END
  651. #ELSEIF(%'@type'% = 'boolean')
  652. 'B'
  653. #ELSE
  654. %_MapAllStr%(%_TrimmedStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
  655. #END,
  656. UNSIGNED4 data_length :=
  657. #IF(%_IsSetType%(%'@type'%))
  658. COUNT(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  659. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  660. LENGTH(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  661. #ELSEIF(REGEXFIND('string', %'@type'%))
  662. LENGTH(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
  663. #ELSEIF(%'@type'% = 'boolean')
  664. 1
  665. #ELSE
  666. LENGTH((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
  667. #END,
  668. BOOLEAN is_filled :=
  669. #IF(%_IsSetType%(%'@type'%))
  670. COUNT(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) > 0
  671. #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
  672. LENGTH(%_TrimmedUni%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))) > 0
  673. #ELSEIF(REGEXFIND('string', %'@type'%))
  674. LENGTH(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))) > 0
  675. #ELSEIF(REGEXFIND('data', %'@type'%))
  676. LENGTH(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) > 0
  677. #ELSEIF(%'@type'% = 'boolean')
  678. TRUE
  679. #ELSE
  680. _inFile.#EXPAND(%'namePrefix'% + %'@name'%) != 0
  681. #END,
  682. BOOLEAN is_number :=
  683. #IF(%_IsSetType%(%'@type'%))
  684. FALSE
  685. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  686. TRUE
  687. #ELSE
  688. FALSE
  689. #END
  690. },
  691. _inFile.#EXPAND(%'namePrefix'% + %'@name'%),
  692. LOCAL
  693. ),
  694. TRANSFORM(%DataInfoRec%, SELF := LEFT)
  695. ),
  696. DATASET
  697. (
  698. 1,
  699. TRANSFORM
  700. (
  701. %DataInfoRec%,
  702. SELF.attribute := %'namePrefix'% + %'@name'%,
  703. SELF.given_attribute_type := %'@ecltype'%,
  704. SELF := []
  705. )
  706. )
  707. )
  708. #SET(needsDelim, 1)
  709. #END
  710. #END
  711. #END
  712. #END
  713. // Insert empty value for syntax checking
  714. #IF(%fieldCount% = 0)
  715. DATASET([], %DataInfoRec%)
  716. #END;
  717. // Get only those attributes that are filled
  718. #UNIQUENAME(filledDataInfo);
  719. LOCAL %filledDataInfo% := %dataInfo%(is_filled);
  720. // Determine the best ECL data type for each attribute
  721. #UNIQUENAME(DataTypeEnum);
  722. LOCAL %DataTypeEnum% := ENUM
  723. (
  724. UNSIGNED4,
  725. AsIs = 0,
  726. SignedInteger = 1,
  727. UnsignedInteger = 2,
  728. FloatingPoint = 4,
  729. ExpNotation = 8
  730. );
  731. #UNIQUENAME(BestTypeFlag);
  732. LOCAL %DataTypeEnum% %BestTypeFlag%(STRING dataPattern, %AttributeType_t% attributeType) := FUNCTION
  733. isLeadingZeroInteger := REGEXFIND('^0[09]{1,18}$', dataPattern);
  734. isSignedInteger := REGEXFIND('^\\-[09]{1,19}$', dataPattern);
  735. isShortUnsignedInteger := REGEXFIND('^[09]{1,19}$', dataPattern);
  736. isUnsignedInteger := REGEXFIND('^\\+?[09]{1,20}$', dataPattern);
  737. isFloatingPoint := REGEXFIND('^(\\-|\\+)?[09]{0,15}\\.[09]{1,15}$', dataPattern);
  738. isExpNotation := REGEXFIND('^(\\-|\\+)?[09]\\.[09]{1,6}[aA]\\-[09]{1,3}$', dataPattern);
  739. stringWithNumbersType := MAP
  740. (
  741. isSignedInteger => %DataTypeEnum%.SignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  742. isShortUnsignedInteger => %DataTypeEnum%.SignedInteger | %DataTypeEnum%.UnsignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  743. isUnsignedInteger => %DataTypeEnum%.UnsignedInteger | %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  744. isFloatingPoint => %DataTypeEnum%.FloatingPoint | %DataTypeEnum%.ExpNotation,
  745. isExpNotation => %DataTypeEnum%.ExpNotation,
  746. %DataTypeEnum%.AsIs
  747. );
  748. bestType := MAP
  749. (
  750. %_IsSetType%(attributeType) => %DataTypeEnum%.AsIs,
  751. REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', attributeType) => %DataTypeEnum%.AsIs,
  752. isLeadingZeroInteger => %DataTypeEnum%.AsIs,
  753. stringWithNumbersType
  754. );
  755. RETURN bestType;
  756. END;
  757. // Estimate integer size from readable data length
  758. #UNIQUENAME(Len2Size);
  759. LOCAL %Len2Size%(UNSIGNED2 c) := MAP ( c < 3 => 1, c < 5 => 2, c < 7 => 3, c < 9 => 4, c < 11 => 5, c < 14 => 6, c < 16 => 7, 8 );
  760. #UNIQUENAME(attributeTypePatterns);
  761. LOCAL %attributeTypePatterns% := TABLE
  762. (
  763. %filledDataInfo%,
  764. {
  765. attribute,
  766. given_attribute_type,
  767. data_pattern,
  768. data_length,
  769. %DataTypeEnum% type_flag := %BestTypeFlag%(TRIM(data_pattern), given_attribute_type),
  770. UNSIGNED4 min_data_length := 0 // will be populated within %attributesWithTypeFlagsSummary%
  771. },
  772. attribute, given_attribute_type, data_pattern, data_length,
  773. MERGE
  774. );
  775. #UNIQUENAME(MinNotZero);
  776. LOCAL %MinNotZero%(UNSIGNED4 n1, UNSIGNED4 n2) := MAP
  777. (
  778. n1 = 0 => n2,
  779. n2 = 0 => n1,
  780. MIN(n1, n2)
  781. );
  782. #UNIQUENAME(attributesWithTypeFlagsSummary);
  783. LOCAL %attributesWithTypeFlagsSummary% := AGGREGATE
  784. (
  785. %attributeTypePatterns%,
  786. RECORDOF(%attributeTypePatterns%),
  787. TRANSFORM
  788. (
  789. RECORDOF(%attributeTypePatterns%),
  790. SELF.data_length := MAX(LEFT.data_length, RIGHT.data_length),
  791. SELF.min_data_length := %MinNotZero%(LEFT.data_length, RIGHT.data_length),
  792. SELF.type_flag := IF(TRIM(RIGHT.attribute) != '', LEFT.type_flag & RIGHT.type_flag, LEFT.type_flag),
  793. SELF := LEFT
  794. ),
  795. TRANSFORM
  796. (
  797. RECORDOF(%attributeTypePatterns%),
  798. SELF.data_length := MAX(RIGHT1.data_length, RIGHT2.data_length),
  799. SELF.min_data_length := %MinNotZero%(RIGHT1.data_length, RIGHT2.data_length),
  800. SELF.type_flag := RIGHT1.type_flag & RIGHT2.type_flag,
  801. SELF := RIGHT1
  802. ),
  803. LEFT.attribute,
  804. FEW
  805. );
  806. #UNIQUENAME(AttributeTypeRec);
  807. LOCAL %AttributeTypeRec% := RECORD
  808. %Attribute_t% attribute;
  809. %AttributeType_t% given_attribute_type;
  810. %AttributeType_t% best_attribute_type;
  811. END;
  812. #UNIQUENAME(attributeBestTypeInfo);
  813. LOCAL %attributeBestTypeInfo% := PROJECT
  814. (
  815. %attributesWithTypeFlagsSummary%,
  816. TRANSFORM
  817. (
  818. %AttributeTypeRec%,
  819. SELF.best_attribute_type := MAP
  820. (
  821. %_IsSetType%(LEFT.given_attribute_type) => LEFT.given_attribute_type,
  822. REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', LEFT.given_attribute_type) => LEFT.given_attribute_type,
  823. REGEXFIND('data', LEFT.given_attribute_type) => 'data' + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, ''),
  824. (LEFT.type_flag & %DataTypeEnum%.UnsignedInteger) != 0 => 'unsigned' + %Len2Size%(LEFT.data_length),
  825. (LEFT.type_flag & %DataTypeEnum%.SignedInteger) != 0 => 'integer' + %Len2Size%(LEFT.data_length),
  826. (LEFT.type_flag & %DataTypeEnum%.FloatingPoint) != 0 => 'real' + IF(LEFT.data_length < 8, '4', '8'),
  827. (LEFT.type_flag & %DataTypeEnum%.ExpNotation) != 0 => 'real8',
  828. REGEXFIND('utf', LEFT.given_attribute_type) => LEFT.given_attribute_type,
  829. REGEXREPLACE('\\d+$', TRIM(LEFT.given_attribute_type), '') + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, '')
  830. ),
  831. SELF := LEFT
  832. )
  833. );
  834. // Build a set of attributes for quartiles, unique values, and modes for
  835. // each processed attribute
  836. #SET(recLevel, 0);
  837. #SET(fieldStack, '');
  838. #SET(namePrefix, '');
  839. #FOR(inFileFields)
  840. #FOR(Field)
  841. #IF(%{@isRecord}% = 1)
  842. #SET(fieldStack, 'r' + %'fieldStack'%)
  843. #APPEND(namePrefix, %'@name'% + '.')
  844. #ELSEIF(%{@isDataset}% = 1)
  845. #SET(fieldStack, 'd' + %'fieldStack'%)
  846. #SET(recLevel, %recLevel% + 1)
  847. #ELSEIF(%{@isEnd}% = 1)
  848. #IF(%'fieldStack'%[1] = 'd')
  849. #SET(recLevel, %recLevel% - 1)
  850. #ELSE
  851. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  852. #END
  853. #SET(fieldStack, %'fieldStack'%[2..]);
  854. #ELSEIF(%recLevel% = 0)
  855. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  856. // Note that we create explicit attributes here for all
  857. // top-level attributes in the dataset that we're
  858. // processing, even if they are not numeric datatypes
  859. #UNIQUENAME(uniqueNumericValueCounts)
  860. %uniqueNumericValueCounts% := PROJECT
  861. (
  862. %filledDataInfo%(attribute = %'namePrefix'% + %'@name'% AND is_number),
  863. TRANSFORM
  864. (
  865. {
  866. REAL value,
  867. UNSIGNED6 cnt,
  868. UNSIGNED6 valueEndPos
  869. },
  870. SELF.value := (REAL)LEFT.string_value,
  871. SELF.cnt := LEFT.value_count,
  872. SELF.valueEndPos := 0
  873. )
  874. );
  875. // Explicit attributes containing scalars
  876. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_min')) := MIN(%uniqueNumericValueCounts%, value);
  877. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_max')) := MAX(%uniqueNumericValueCounts%, value);
  878. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave')) := SUM(%uniqueNumericValueCounts%, value * cnt) / SUM(%uniqueNumericValueCounts%, cnt);
  879. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_std_dev')) := SQRT(SUM(%uniqueNumericValueCounts%, (value - #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave'))) * (value - #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave'))) * cnt) / SUM(%uniqueNumericValueCounts%, cnt));
  880. // Determine the position of the last record in the original
  881. // dataset that contains a particular value
  882. #UNIQUENAME(uniqueNumericValuePos)
  883. %uniqueNumericValuePos% := ITERATE
  884. (
  885. SORT(%uniqueNumericValueCounts%, value, SKEW(1)),
  886. TRANSFORM
  887. (
  888. RECORDOF(LEFT),
  889. SELF.valueEndPos := LEFT.valueEndPos + RIGHT.cnt,
  890. SELF := RIGHT
  891. )
  892. );
  893. // The total number of records in this subset
  894. #UNIQUENAME(wholeNumRecs)
  895. LOCAL %wholeNumRecs% := MAX(%uniqueNumericValuePos%, valueEndPos);
  896. #UNIQUENAME(halfNumRecs);
  897. LOCAL %halfNumRecs% := %wholeNumRecs% DIV 2;
  898. // Find the median
  899. #UNIQUENAME(q2Pos1);
  900. LOCAL %q2Pos1% := %halfNumRecs% + (%wholeNumRecs% % 2);
  901. #UNIQUENAME(q2Value1);
  902. LOCAL %q2Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q2Pos1%), value);
  903. #UNIQUENAME(q2Pos2);
  904. LOCAL %q2Pos2% := %q2Pos1% + ((%wholeNumRecs% + 1) % 2);
  905. #UNIQUENAME(q2Value2);
  906. LOCAL %q2Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q2Pos2%), value);
  907. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q2_value')) := AVE(%q2Value1%, %q2Value2%);
  908. // Find the lower quartile
  909. #UNIQUENAME(q1Pos1);
  910. LOCAL %q1Pos1% := (%halfNumRecs% DIV 2) + (%halfNumRecs% % 2);
  911. #UNIQUENAME(q1Value1);
  912. LOCAL %q1Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q1Pos1%), value);
  913. #UNIQUENAME(q1Pos2);
  914. LOCAL %q1Pos2% := %q1Pos1% + ((%halfNumRecs% + 1) % 2);
  915. #UNIQUENAME(q1Value2);
  916. LOCAL %q1Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q1Pos2%), value);
  917. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q1_value')) := IF(%halfNumRecs% > 0, AVE(%q1Value1%, %q1Value2%), 0);
  918. // Find the upper quartile
  919. #UNIQUENAME(q3Pos1);
  920. LOCAL %q3Pos1% := MAX(%q2Pos1%, %q2Pos2%) + (%halfNumRecs% DIV 2) + (%halfNumRecs% % 2);
  921. #UNIQUENAME(q3Value1);
  922. LOCAL %q3Value1% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q3Pos1%), value);
  923. #UNIQUENAME(q3Pos2);
  924. LOCAL %q3Pos2% := %q3Pos1% - ((%halfNumRecs% + 1) % 2);
  925. #UNIQUENAME(q3Value2);
  926. LOCAL %q3Value2% := MIN(%uniqueNumericValuePos%(valueEndPos >= %q3Pos2%), value);
  927. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q3_value')) := IF(%halfNumRecs% > 0, AVE(%q3Value1%, %q3Value2%), 0);
  928. // Derive all unique data values and the number of times
  929. // each occurs in the data
  930. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')) := TABLE
  931. (
  932. %filledDataInfo%(attribute = %'namePrefix'% + %'@name'%),
  933. {
  934. string_value,
  935. UNSIGNED4 rec_count := SUM(GROUP, value_count)
  936. },
  937. string_value,
  938. MERGE
  939. );
  940. // Find the mode of the (string) data; using a JOIN here
  941. // to avoid the 10MB limit error that sometimes occurs
  942. // when you use filters to find a single value; also note
  943. // the TOPN calls to reduce the search space, which also
  944. // effectively limit the final result to _maxModes records
  945. #UNIQUENAME(topRecords);
  946. %topRecords% := TOPN(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')), _maxModes, -rec_count);
  947. #UNIQUENAME(topRecord)
  948. %topRecord% := TOPN(%topRecords%, 1, -rec_count);
  949. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_mode_values')) := JOIN
  950. (
  951. %topRecords%,
  952. %topRecord%,
  953. LEFT.rec_count = RIGHT.rec_count,
  954. TRANSFORM
  955. (
  956. ModeRec,
  957. SELF.value := LEFT.string_value,
  958. SELF.rec_count := LEFT.rec_count
  959. ),
  960. SMART
  961. ) : ONWARNING(4531, IGNORE);
  962. // Get records with low cardinality
  963. LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_lcb_recs')) := IF
  964. (
  965. COUNT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs'))) <= _lcbLimit,
  966. PROJECT
  967. (
  968. SORT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')), -rec_count),
  969. TRANSFORM
  970. (
  971. ModeRec,
  972. SELF.value := LEFT.string_value,
  973. SELF.rec_count := LEFT.rec_count
  974. )
  975. ),
  976. DATASET([], ModeRec)
  977. );
  978. #END
  979. #END
  980. #END
  981. #END
  982. // Run correlations on all unique pairs of numeric fields in the data
  983. #UNIQUENAME(BaseCorrelationLayout);
  984. LOCAL %BaseCorrelationLayout% := RECORD
  985. %Attribute_t% attribute_x;
  986. %Attribute_t% attribute_y;
  987. REAL corr;
  988. END;
  989. #UNIQUENAME(corrNamePosX);
  990. #UNIQUENAME(corrNamePosY);
  991. #UNIQUENAME(fieldX);
  992. #UNIQUENAME(fieldY);
  993. #SET(needsDelim, 0);
  994. #UNIQUENAME(correlations0);
  995. LOCAL %correlations0% := DATASET
  996. (
  997. [
  998. #SET(corrNamePosX, 1)
  999. #LOOP
  1000. #SET(fieldX, REGEXFIND('^([^,]+)', %'numericFields'%[%corrNamePosX%..], 1))
  1001. #IF(%'fieldX'% != '')
  1002. #SET(corrNamePosY, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1003. #LOOP
  1004. #SET(fieldY, REGEXFIND('^([^,]+)', %'numericFields'%[%corrNamePosY%..], 1))
  1005. #IF(%'fieldY'% != '')
  1006. #IF(%needsDelim% = 1) , #END
  1007. {
  1008. %'fieldX'%,
  1009. %'fieldY'%,
  1010. CORRELATION(_inFile, %fieldX%, %fieldY%)
  1011. }
  1012. #SET(needsDelim, 1)
  1013. #SET(corrNamePosY, %corrNamePosY% + LENGTH(%'fieldY'%) + 1)
  1014. #ELSE
  1015. #BREAK
  1016. #END
  1017. #END
  1018. #SET(corrNamePosX, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1019. #ELSE
  1020. #BREAK
  1021. #END
  1022. #END
  1023. ],
  1024. %BaseCorrelationLayout%
  1025. );
  1026. // Append a duplicate of the correlations to itself with the X and Y fields
  1027. // reversed so we can easily merge results on a per-attribute basis later
  1028. #UNIQUENAME(correlations);
  1029. LOCAL %correlations% := %correlations0% + PROJECT
  1030. (
  1031. %correlations0%,
  1032. TRANSFORM
  1033. (
  1034. RECORDOF(LEFT),
  1035. SELF.attribute_x := LEFT.attribute_y,
  1036. SELF.attribute_y := LEFT.attribute_x,
  1037. SELF := LEFT
  1038. )
  1039. );
  1040. // Create a small dataset that specifies the output order of the named
  1041. // attributes (which should be the same as the input order)
  1042. #UNIQUENAME(resultOrderDS);
  1043. LOCAL %resultOrderDS% := DATASET
  1044. (
  1045. [
  1046. #SET(needsDelim, 0)
  1047. #SET(corrNamePosX, 1)
  1048. #SET(fieldY, 1)
  1049. #LOOP
  1050. #SET(fieldX, REGEXFIND('^([^,]+)', %'explicitFields'%[%corrNamePosX%..], 1))
  1051. #IF(%'fieldX'% != '')
  1052. #IF(%needsDelim% = 1) , #END
  1053. {%fieldY%, %'fieldX'%}
  1054. #SET(needsDelim, 1)
  1055. #SET(corrNamePosX, %corrNamePosX% + LENGTH(%'fieldX'%) + 1)
  1056. #SET(fieldY, %fieldY% + 1)
  1057. #ELSE
  1058. #BREAK
  1059. #END
  1060. #END
  1061. ],
  1062. {
  1063. UNSIGNED2 nameOrder,
  1064. %Attribute_t% attrName
  1065. }
  1066. );
  1067. //--------------------------------------------------------------------------
  1068. // Collect individual stats for each attribute; these are grouped by the
  1069. // criteria used to group them
  1070. //--------------------------------------------------------------------------
  1071. // Count data patterns used per attribute; extract the most common and
  1072. // most rare, taking care to not allow the two to overlap; we will
  1073. // replace the '0' character left in from the pattern generation with
  1074. // a '9' character to make the numeric pattern complete
  1075. #UNIQUENAME(dataPatternStats0);
  1076. LOCAL %dataPatternStats0% := PROJECT
  1077. (
  1078. %filledDataInfo%,
  1079. TRANSFORM
  1080. (
  1081. RECORDOF(LEFT),
  1082. SELF.data_pattern := Std.Str.FindReplace(LEFT.data_pattern, '0', '9'),
  1083. SELF := LEFT
  1084. )
  1085. );
  1086. #UNIQUENAME(dataPatternStats);
  1087. LOCAL %dataPatternStats% := TABLE
  1088. (
  1089. DISTRIBUTE(%dataPatternStats0%, HASH32(attribute)),
  1090. {
  1091. attribute,
  1092. data_pattern,
  1093. STRING example := string_value[..%foundMaxPatternLen%],
  1094. UNSIGNED4 rec_count := SUM(GROUP, value_count)
  1095. },
  1096. attribute, data_pattern,
  1097. LOCAL
  1098. ) : ONWARNING(2168, IGNORE);
  1099. #UNIQUENAME(groupedDataPatterns);
  1100. LOCAL %groupedDataPatterns% := GROUP(SORT(%dataPatternStats%, attribute, LOCAL), attribute, LOCAL);
  1101. #UNIQUENAME(topDataPatterns);
  1102. LOCAL %topDataPatterns% := UNGROUP(TOPN(%groupedDataPatterns%, (UNSIGNED)_maxPatterns, -rec_count, data_pattern));
  1103. #UNIQUENAME(rareDataPatterns0);
  1104. LOCAL %rareDataPatterns0% := UNGROUP(TOPN(%groupedDataPatterns%, (UNSIGNED)_maxPatterns, rec_count, data_pattern));
  1105. #UNIQUENAME(rareDataPatterns);
  1106. LOCAL %rareDataPatterns% := JOIN
  1107. (
  1108. %rareDataPatterns0%,
  1109. %topDataPatterns%,
  1110. LEFT.attribute = RIGHT.attribute AND LEFT.data_pattern = RIGHT.data_pattern,
  1111. TRANSFORM(LEFT),
  1112. LEFT ONLY
  1113. ) : ONWARNING(4531, IGNORE);
  1114. // Find min, max and average data lengths per attribute
  1115. #UNIQUENAME(dataLengthStats);
  1116. LOCAL %dataLengthStats% := TABLE
  1117. (
  1118. %filledDataInfo%,
  1119. {
  1120. attribute,
  1121. UNSIGNED4 min_length := MIN(GROUP, data_length),
  1122. UNSIGNED4 max_length := MAX(GROUP, data_length),
  1123. UNSIGNED4 ave_length := SUM(GROUP, data_length * value_count) / SUM(GROUP, value_count)
  1124. },
  1125. attribute,
  1126. MERGE
  1127. );
  1128. // Count attribute fill rates per attribute; will be turned into
  1129. // percentages later
  1130. #UNIQUENAME(dataFilledStats);
  1131. LOCAL %dataFilledStats% := TABLE
  1132. (
  1133. %dataInfo%,
  1134. {
  1135. attribute,
  1136. given_attribute_type,
  1137. UNSIGNED4 rec_count := SUM(GROUP, value_count),
  1138. UNSIGNED4 filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
  1139. },
  1140. attribute, given_attribute_type,
  1141. MERGE
  1142. );
  1143. // Compute the cardinality and pull in previously-computed explicit
  1144. // attribute values at the same time
  1145. #UNIQUENAME(cardinalityAndNumerics);
  1146. LOCAL %cardinalityAndNumerics% := DATASET
  1147. (
  1148. [
  1149. #SET(recLevel, 0)
  1150. #SET(fieldStack, '')
  1151. #SET(namePrefix, '')
  1152. #SET(needsDelim, 0)
  1153. #FOR(inFileFields)
  1154. #FOR(Field)
  1155. #IF(%{@isRecord}% = 1)
  1156. #SET(fieldStack, 'r' + %'fieldStack'%)
  1157. #APPEND(namePrefix, %'@name'% + '.')
  1158. #ELSEIF(%{@isDataset}% = 1)
  1159. #SET(fieldStack, 'd' + %'fieldStack'%)
  1160. #SET(recLevel, %recLevel% + 1)
  1161. #ELSEIF(%{@isEnd}% = 1)
  1162. #IF(%'fieldStack'%[1] = 'd')
  1163. #SET(recLevel, %recLevel% - 1)
  1164. #ELSE
  1165. #SET(namePrefix, REGEXREPLACE('\\w+\\.$', %'namePrefix'%, ''))
  1166. #END
  1167. #SET(fieldStack, %'fieldStack'%[2..])
  1168. #ELSEIF(%recLevel% = 0)
  1169. #IF(%_CanProcessAttribute%(%'namePrefix'% + %'@name'%))
  1170. #IF(%needsDelim% = 1) , #END
  1171. {
  1172. %'namePrefix'% + %'@name'%,
  1173. #IF(%_IsSetType%(%'@type'%))
  1174. FALSE,
  1175. #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
  1176. TRUE,
  1177. #ELSE
  1178. FALSE,
  1179. #END
  1180. #IF(%FeatureEnabledCardinality%())
  1181. COUNT(#EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs'))),
  1182. #ELSE
  1183. 0,
  1184. #END
  1185. #IF(%FeatureEnabledMinMax%())
  1186. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_min')),
  1187. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_max')),
  1188. #ELSE
  1189. 0,
  1190. 0,
  1191. #END
  1192. #IF(%FeatureEnabledMean%())
  1193. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_ave')),
  1194. #ELSE
  1195. 0,
  1196. #END
  1197. #IF(%FeatureEnabledStdDev%())
  1198. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_std_dev')),
  1199. #ELSE
  1200. 0,
  1201. #END
  1202. #IF(%FeatureEnabledQuartiles%())
  1203. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q1_value')),
  1204. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q2_value')),
  1205. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_q3_value')),
  1206. #ELSE
  1207. 0,
  1208. 0,
  1209. 0,
  1210. #END
  1211. #IF(%FeatureEnabledModes%())
  1212. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_mode_values'))(rec_count > 1), // Modes must have more than one instance
  1213. #ELSE
  1214. DATASET([], ModeRec),
  1215. #END
  1216. #IF(%FeatureEnabledLowCardinalityBreakdown%())
  1217. #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_lcb_recs'))
  1218. #ELSE
  1219. DATASET([], ModeRec)
  1220. #END
  1221. }
  1222. #SET(needsDelim, 1)
  1223. #END
  1224. #END
  1225. #END
  1226. #END
  1227. ],
  1228. {
  1229. %Attribute_t% attribute,
  1230. BOOLEAN is_numeric,
  1231. UNSIGNED4 cardinality,
  1232. REAL numeric_min,
  1233. REAL numeric_max,
  1234. REAL numeric_mean,
  1235. REAL numeric_std_dev,
  1236. REAL numeric_lower_quartile,
  1237. REAL numeric_median,
  1238. REAL numeric_upper_quartile,
  1239. DATASET(ModeRec) modes;
  1240. DATASET(ModeRec) cardinality_breakdown;
  1241. }
  1242. );
  1243. //--------------------------------------------------------------------------
  1244. // Collect the individual results into a single output dataset
  1245. //--------------------------------------------------------------------------
  1246. #UNIQUENAME(final10);
  1247. LOCAL %final10% := PROJECT
  1248. (
  1249. %dataFilledStats%,
  1250. TRANSFORM
  1251. (
  1252. _resultLayout,
  1253. SELF.attribute := TRIM(LEFT.attribute, RIGHT),
  1254. SELF.given_attribute_type := TRIM(LEFT.given_attribute_type, RIGHT),
  1255. SELF.rec_count := LEFT.rec_count,
  1256. SELF.fill_rate := #IF(%FeatureEnabledFillRate%()) LEFT.filled_count / LEFT.rec_count * 100 #ELSE 0 #END,
  1257. SELF.fill_count := #IF(%FeatureEnabledFillRate%()) LEFT.filled_count #ELSE 0 #END,
  1258. SELF := []
  1259. )
  1260. );
  1261. #UNIQUENAME(final15);
  1262. LOCAL %final15% :=
  1263. #IF(%FeatureEnabledBestECLTypes%())
  1264. JOIN
  1265. (
  1266. %final10%,
  1267. %attributeBestTypeInfo%,
  1268. LEFT.attribute = RIGHT.attribute,
  1269. TRANSFORM
  1270. (
  1271. RECORDOF(LEFT),
  1272. SELF.best_attribute_type := IF(TRIM(RIGHT.best_attribute_type, RIGHT) != '', TRIM(RIGHT.best_attribute_type, RIGHT), LEFT.given_attribute_type),
  1273. SELF := LEFT
  1274. ),
  1275. LEFT OUTER
  1276. ) : ONWARNING(4531, IGNORE)
  1277. #ELSE
  1278. %final10%
  1279. #END;
  1280. #UNIQUENAME(final20);
  1281. LOCAL %final20% :=
  1282. #IF(%FeatureEnabledLengths%())
  1283. JOIN
  1284. (
  1285. %final15%,
  1286. %dataLengthStats%,
  1287. LEFT.attribute = RIGHT.attribute,
  1288. TRANSFORM
  1289. (
  1290. RECORDOF(LEFT),
  1291. SELF.attribute := LEFT.attribute,
  1292. SELF := RIGHT,
  1293. SELF := LEFT
  1294. ),
  1295. LEFT OUTER, KEEP(1), SMART
  1296. ) : ONWARNING(4531, IGNORE)
  1297. #ELSE
  1298. %final15%
  1299. #END;
  1300. #UNIQUENAME(final30);
  1301. LOCAL %final30% :=
  1302. #IF(%FeatureEnabledCardinality%() OR %FeatureEnabledLowCardinalityBreakdown%() OR %FeatureEnabledMinMax%() OR %FeatureEnabledMean%() OR %FeatureEnabledStdDev%() OR %FeatureEnabledQuartiles%() OR %FeatureEnabledModes%())
  1303. JOIN
  1304. (
  1305. %final20%,
  1306. %cardinalityAndNumerics%,
  1307. LEFT.attribute = RIGHT.attribute,
  1308. TRANSFORM
  1309. (
  1310. RECORDOF(LEFT),
  1311. SELF.attribute := LEFT.attribute,
  1312. SELF := RIGHT,
  1313. SELF := LEFT
  1314. ),
  1315. LEFT OUTER, KEEP(1), SMART
  1316. ) : ONWARNING(4531, IGNORE)
  1317. #ELSE
  1318. %final20%
  1319. #END;
  1320. #UNIQUENAME(final40);
  1321. LOCAL %final40% :=
  1322. #IF(%FeatureEnabledPatterns%())
  1323. DENORMALIZE
  1324. (
  1325. %final30%,
  1326. %topDataPatterns%,
  1327. LEFT.attribute = RIGHT.attribute,
  1328. GROUP,
  1329. TRANSFORM
  1330. (
  1331. RECORDOF(LEFT),
  1332. SELF.popular_patterns := SORT(PROJECT(ROWS(RIGHT), TRANSFORM(PatternCountRec, SELF := LEFT)), -rec_count, data_pattern),
  1333. SELF := LEFT
  1334. ),
  1335. LEFT OUTER, SMART
  1336. ) : ONWARNING(4531, IGNORE)
  1337. #ELSE
  1338. %final30%
  1339. #END;
  1340. #UNIQUENAME(final50);
  1341. LOCAL %final50% :=
  1342. #IF(%FeatureEnabledPatterns%())
  1343. DENORMALIZE
  1344. (
  1345. %final40%,
  1346. %rareDataPatterns%,
  1347. LEFT.attribute = RIGHT.attribute,
  1348. GROUP,
  1349. TRANSFORM
  1350. (
  1351. RECORDOF(LEFT),
  1352. SELF.rare_patterns := SORT(PROJECT(ROWS(RIGHT), TRANSFORM(PatternCountRec, SELF := LEFT)), rec_count, data_pattern),
  1353. SELF := LEFT
  1354. ),
  1355. LEFT OUTER, SMART
  1356. ) : ONWARNING(4531, IGNORE)
  1357. #ELSE
  1358. %final40%
  1359. #END;
  1360. #UNIQUENAME(final60);
  1361. LOCAL %final60% :=
  1362. #IF(%FeatureEnabledCorrelations%())
  1363. DENORMALIZE
  1364. (
  1365. %final50%,
  1366. %correlations%,
  1367. LEFT.attribute = RIGHT.attribute_x,
  1368. GROUP,
  1369. TRANSFORM
  1370. (
  1371. RECORDOF(LEFT),
  1372. SELF.numeric_correlations := SORT
  1373. (
  1374. PROJECT
  1375. (
  1376. ROWS(RIGHT),
  1377. TRANSFORM
  1378. (
  1379. CorrelationRec,
  1380. SELF.attribute := LEFT.attribute_y,
  1381. SELF.corr := LEFT.corr
  1382. )
  1383. ),
  1384. -corr
  1385. ),
  1386. SELF := LEFT
  1387. ),
  1388. LEFT OUTER, SMART
  1389. )
  1390. #ELSE
  1391. %final50%
  1392. #END;
  1393. // Append the attribute order to the results; we will sort on the order
  1394. // when creating the final output
  1395. #UNIQUENAME(final70);
  1396. LOCAL %final70% := JOIN
  1397. (
  1398. %final60%,
  1399. %resultOrderDS%,
  1400. TRIM(LEFT.attribute, LEFT, RIGHT) = RIGHT.attrName,
  1401. TRANSFORM
  1402. (
  1403. RECORDOF(LEFT),
  1404. SELF.sortValue := _sortPrefix + INTFORMAT(RIGHT.nameOrder, 5, 1),
  1405. SELF.attribute := _attrNamePrefix + LEFT.attribute,
  1406. SELF := LEFT
  1407. )
  1408. ) : ONWARNING(4531, IGNORE);
  1409. RETURN #IF(%fieldCount% > 0) %final70% #ELSE DATASET([], _resultLayout) #END;
  1410. ENDMACRO;
  1411. //==========================================================================
  1412. // Call _Inner_Profile() with the given input dataset top-level scalar attributes,
  1413. // then again for each child dataset that has been found; combine the
  1414. // results of all the calls
  1415. #UNIQUENAME(collectedResults);
  1416. LOCAL %collectedResults% :=
  1417. #IF(%'explicitScalarFields'% != '')
  1418. _Inner_Profile
  1419. (
  1420. GLOBAL(%distributedInFile%),
  1421. %'explicitScalarFields'%,
  1422. maxPatterns,
  1423. maxPatternLen,
  1424. %lowCardinalityThreshold%,
  1425. %MAX_MODES%,
  1426. OutputLayout,
  1427. '',
  1428. ''
  1429. )
  1430. #ELSE
  1431. DATASET([], OutputLayout)
  1432. #END
  1433. #UNIQUENAME(dsNameValue)
  1434. #SET(namePos, 1)
  1435. #LOOP
  1436. #SET(dsNameValue, REGEXFIND('^([^,]+)', %'childDSFields'%[%namePos%..], 1))
  1437. #IF(%'dsNameValue'% != '')
  1438. #SET(numValue, REGEXFIND('^(\\d+):', %'dsNameValue'%, 1))
  1439. #SET(nameValue, REGEXFIND(':([^:]+)$', %'dsNameValue'%, 1))
  1440. // The child dataset should have been extracted into its own
  1441. // local attribute; reference it during our call to the inner
  1442. // profile function macro
  1443. #SET(temp, #MANGLE(%'nameValue'%))
  1444. + _Inner_Profile
  1445. (
  1446. GLOBAL(%temp%),
  1447. '',
  1448. maxPatterns,
  1449. maxPatternLen,
  1450. %lowCardinalityThreshold%,
  1451. %MAX_MODES%,
  1452. OutputLayout,
  1453. %'nameValue'% + '.',
  1454. INTFORMAT(%numValue%, 5, 1) + '.'
  1455. )
  1456. #SET(namePos, %namePos% + LENGTH(%'dsNameValue'%) + 1)
  1457. #ELSE
  1458. #BREAK
  1459. #END
  1460. #END;
  1461. // Put the combined _Inner_Profile() results in the right order and layout
  1462. #UNIQUENAME(finalData);
  1463. LOCAL %finalData% := PROJECT(SORT(%collectedResults%, sortValue), %FinalOutputLayout%);
  1464. RETURN %finalData%;
  1465. ENDMACRO;