BestRecordStructure.ecl 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. /***
  2. * Function macro that leverages DataPatterns to return a string defining the
  3. * best ECL record structure for the input data.
  4. *
  5. * @param inFile The dataset to process; REQUIRED
  6. * @param sampling A positive integer representing a percentage of
  7. * inFile to examine, which is useful when analyzing a
  8. * very large dataset and only an estimate is
  9. * sufficient; valid range for this argument is
  10. * 1-100; values outside of this range will be
  11. * clamped; OPTIONAL, defaults to 100 (which indicates
  12. * that the entire dataset will be analyzed)
  13. * @param emitTransform Boolean governing whether the function emits a
  14. * TRANSFORM function that could be used to rewrite
  15. * the dataset into the 'best' record definition;
  16. * OPTIONAL, defaults to FALSE.
  17. * @param textOutput Boolean governing the type of result that is
  18. * delivered by this function; if FALSE then a
  19. * recordset of STRINGs will be returned; if TRUE
  20. * then a dataset with a single STRING field, with
  21. * the contents formatted for HTML, will be
  22. * returned (this is the ideal output if the
  23. * intention is to copy the output from ECL Watch);
  24. * OPTIONAL, defaults to FALSE
  25. *
  26. * @return A recordset defining the best ECL record structure for the data.
  27. * If textOutput is FALSE (the default) then each record will contain
  28. * one field declaration, and the list of declarations will be wrapped
  29. * with RECORD and END strings; if the emitTransform argument was
  30. * TRUE, there will also be a set of records that comprise a
  31. * stand-alone TRANSFORM function. If textOutput is TRUE then only
  32. * one record will be returned, containing an HTML-formatted string
  33. * containing the new field declarations (and optionally the
  34. * TRANSFORM); this is the ideal format if the intention is to copy
  35. * the result from ECL Watch.
  36. */
  37. EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOutput = FALSE) := FUNCTIONMACRO
  // Open an empty XML scope.
  // NOTE(review): presumably required so the template-language directives
  // below (#EXPORTXML, #FOR, #SET, ...) can be used -- confirm against the
  // ECL template language documentation.
  LOADXML('<xml/>');
  // Export the record structure of inFile into the template symbol
  // bestInFileFields; the #FOR loops further down walk its Field elements
  #EXPORTXML(bestInFileFields, RECORDOF(inFile));
  // Template symbols used as scratch variables by the #FOR loops below:
  //   bestFieldStack      - ';'-delimited stack of the currently open embedded
  //                         records / child datasets, each 'type:position:ecltype'
  //   bestStructType      - 'r' (embedded record) or 'd' (child dataset)
  //   bestLayoutType      - structure type taken from the top stack entry
  //   bestCapturedPos     - start position taken from the top stack entry
  //   bestPrevCapturedPos - position of the most recently seen plain field
  //   bestLayoutName      - ECL type taken from the top stack entry
  //   bestNeedsDelim      - 1 once a row has been emitted, so that later rows
  //                         are preceded by a comma
  //   bestNamePrefix      - dotted prefix made of the enclosing field names
  //   recLevel            - current nesting depth (0 = top level)
  #UNIQUENAME(bestFieldStack);
  #UNIQUENAME(bestStructType);
  #UNIQUENAME(bestLayoutType);
  #UNIQUENAME(bestCapturedPos);
  #UNIQUENAME(bestPrevCapturedPos);
  #UNIQUENAME(bestLayoutName);
  #UNIQUENAME(bestNeedsDelim);
  #UNIQUENAME(bestNamePrefix);
  #UNIQUENAME(recLevel);
  IMPORT Std;
  // Attribute naming note: In order to reduce symbol collisions with calling
  // code, all LOCAL attributes are prefixed with two underscore characters;
  // normally, a #UNIQUENAME would be used instead, but there is apparently
  // a problem with using that for ECL attributes when another function
  // macro is called (namely, Profile); using double underscores is not an
  // optimal solution but the chance of symbol collision should at least be
  // reduced
  // Base name used when generating names for unnamed embedded records and
  // child datasets (a numeric suffix is appended where it is used)
  LOCAL __DATAREC_NAME := 'DataRec';
  // Base name of the top-level record definition in the generated output
  LOCAL __LAYOUT_NAME := 'Layout';
  // One line of generated output text
  LOCAL __StringRec := {STRING s};
  // Reports whether a field whose type changes from oldType to newType
  // belongs to a different broad type family afterwards, in which case the
  // generated TRANSFORM gets an explicit type cast.  Type names are compared
  // case-insensitively.
  //
  // @param   oldType     ECL type name of the field as originally declared
  // @param   newType     ECL type name recommended for the field
  // @return  TRUE if the two types fall into different families
  LOCAL __NeedCoercion(STRING oldType, STRING newType) := FUNCTION
      // Reduce a lowercase type name to its family ('string', 'data',
      // 'boolean' or 'numeric'); anything unrecognized is returned as-is
      TypeFamily(STRING t) := FUNCTION
          isTextual := t[..6] = 'string'
              OR t[..13] = 'ebcdic string'
              OR t[..7] = 'qstring'
              OR t[..9] = 'varstring'
              OR t[..3] = 'utf'
              OR t[..7] = 'unicode'
              OR t[..10] = 'varunicode';
          isBinary := t[..4] = 'data';
          isLogical := t[..7] = 'boolean';
          isNumber := t[..7] = 'integer'
              OR t[..18] = 'big_endian integer'
              OR t[..4] = 'real'
              OR t[..7] = 'decimal'
              OR t[..8] = 'udecimal'
              OR t[..8] = 'unsigned'
              OR t[..19] = 'big_endian unsigned';

          RETURN MAP
              (
                  isTextual   =>  'string',
                  isBinary    =>  'data',
                  isLogical   =>  'boolean',
                  isNumber    =>  'numeric',
                  t
              );
      END;

      RETURN TypeFamily(Std.Str.ToLowerCase(oldType)) != TypeFamily(Std.Str.ToLowerCase(newType));
  END;
  // Build a dataset containing information about embedded records and
  // child datasets; we need to track the beginning and ending positions
  // of the fields defined within those structures
  //
  // One row is produced per embedded record / child dataset:
  //   layoutType - 'r' for an embedded record, 'd' for a child dataset
  //   layoutName - the ECL type recorded when the structure was opened
  //   fieldName  - the name attribute of the element that closes the structure
  //   startPos   - position of the field that opened the structure
  //   endPos     - position of the last plain field seen before the structure
  //                was closed
  LOCAL __ChildRecInfoLayout := RECORD
      STRING layoutType;
      STRING layoutName;
      STRING fieldName;
      UNSIGNED2 startPos;
      UNSIGNED2 endPos;
  END;
  // The rows are generated at compile time by walking the exported field list.
  // bestFieldStack acts as a stack of open structures: a new entry is put in
  // front when a record/dataset field is met, and the front entry is taken
  // off (and emitted as a row) when the matching end element is met.
  // NOTE(review): bestPrevCapturedPos is only assigned when a plain field is
  // visited, so a structure that closes before any plain field has been seen
  // would read it unset -- confirm whether such input is possible.
  LOCAL __childRecInfo := DATASET
      (
          [
              #SET(bestFieldStack, '')
              #SET(bestNeedsDelim, 0)
              #FOR(bestInFileFields)
                  #FOR(Field)
                      #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                          // Opening a structure: push 'type:position:ecltype'
                          #IF(%{@isRecord}% = 1)
                              #SET(bestStructType, 'r')
                          #ELSE
                              #SET(bestStructType, 'd')
                          #END
                          #IF(%'bestFieldStack'% != '')
                              #SET(bestFieldStack, ';' + %'bestFieldStack'%)
                          #END
                          #SET(bestFieldStack, %'bestStructType'% + ':' + %'@position'% + ':' + %'@ecltype'% + %'bestFieldStack'%)
                      #ELSEIF(%{@isEnd}% = 1)
                          // Closing a structure: read the front stack entry,
                          // drop it from the stack, then emit one row
                          #SET(bestLayoutType, %'bestFieldStack'%[1])
                          #SET(bestCapturedPos, REGEXFIND('.:(\\d+)', %'bestFieldStack'%, 1))
                          #SET(bestLayoutName, REGEXFIND('.:\\d+:([^;]+)', %'bestFieldStack'%, 1))
                          #SET(bestFieldStack, REGEXFIND('^[^;]+;(.*)', %'bestFieldStack'%, 1))
                          #IF(%bestNeedsDelim% = 1) , #END
                          {
                              %'bestLayoutType'%,
                              %'bestLayoutName'%,
                              %'@name'%,
                              %bestCapturedPos%,
                              %bestPrevCapturedPos%
                          }
                          #SET(bestNeedsDelim, 1)
                      #ELSE
                          // Plain field: remember its position as a candidate
                          // end position for the enclosing structure
                          #SET(bestPrevCapturedPos, %@position%)
                      #END
                  #END
              #END
          ],
          __ChildRecInfoLayout
      );
  // Extract the original data type and position of the fields within the
  // input dataset
  //
  // One row is produced per field (end-of-structure markers are skipped):
  //   eclType   - the field's declared ECL type
  //   name      - the field's own name
  //   fullName  - name prefixed with the dotted names of enclosing structures
  //   isRecord  - TRUE if the field is an embedded record
  //   isDataset - TRUE if the field is a child dataset
  //   depth     - nesting level of the field (0 = top level)
  //   position  - the field's position as reported by #EXPORTXML
  LOCAL __FieldInfoLayout := RECORD
      STRING eclType;
      STRING name;
      STRING fullName;
      BOOLEAN isRecord;
      BOOLEAN isDataset;
      UNSIGNED2 depth;
      UNSIGNED2 position;
  END;
  // The rows are generated at compile time by walking the exported field
  // list; bestNamePrefix and recLevel track the current nesting while walking
  LOCAL __fieldInfo0 := DATASET
      (
          [
              #SET(bestFieldStack, '')
              #SET(bestNeedsDelim, 0)
              #SET(bestNamePrefix, '')
              #SET(recLevel, 0)
              #FOR(bestInFileFields)
                  #FOR(Field)
                      #IF(%@isEnd% != 1)
                          // Emit a row for every element that is not an
                          // end-of-structure marker
                          #IF(%bestNeedsDelim% = 1) , #END
                          {
                              %'@ecltype'%,
                              %'@name'%,
                              %'bestNamePrefix'% + %'@name'%,
                              #IF(%@isRecord% = 1) TRUE #ELSE FALSE #END,
                              #IF(%@isDataset% = 1) TRUE #ELSE FALSE #END,
                              %recLevel%,
                              %@position%
                          }
                          #SET(bestNeedsDelim, 1)
                      #END
                      #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                          // Entering a structure: extend the name prefix and
                          // go one level deeper
                          #APPEND(bestNamePrefix, %'@name'% + '.')
                          #SET(recLevel, %recLevel% + 1)
                      #ELSEIF(%{@isEnd}% = 1)
                          // Leaving a structure: strip the last 'name.' from
                          // the prefix and go one level up
                          #SET(bestNamePrefix, REGEXREPLACE('\\w+\\.$', %'bestNamePrefix'%, ''))
                          #SET(recLevel, %recLevel% - 1)
                      #END
                  #END
              #END
          ],
          __FieldInfoLayout
      );
  // Add an endPosition column to the field list.  Fields that open an
  // embedded record or child dataset pick up the end position recorded in
  // __childRecInfo (matched on field name and start position); the LEFT OUTER
  // join keeps every other field as well
  LOCAL __fieldInfo10 := JOIN
      (
          __fieldInfo0,
          __childRecInfo,
          LEFT.name = RIGHT.fieldName AND LEFT.position = RIGHT.startPos,
          TRANSFORM
              (
                  {RECORDOF(LEFT), UNSIGNED2 endPosition},
                  SELF.endPosition := RIGHT.endPos,
                  SELF := LEFT
              ),
          LEFT OUTER
      );
  // Ask Profile() for only the 'best_ecl_types' feature, honoring the
  // caller's sampling percentage
  LOCAL __patternRes := Std.DataPatterns.Profile(inFile, features := 'best_ecl_types', sampleSize := sampling);
  // Add a bestAttributeType column to the field list, matching on the
  // fully-qualified field name.  When Profile() supplied no recommendation
  // for a field, its original type is carried over unchanged; otherwise the
  // recommendation is used, uppercased
  LOCAL __fieldInfo15 := JOIN
      (
          __fieldInfo10,
          __patternRes,
          LEFT.fullName = RIGHT.attribute,
          TRANSFORM
              (
                  {RECORDOF(LEFT), STRING bestAttributeType},
                  SELF.bestAttributeType := IF
                      (
                          RIGHT.best_attribute_type = '',
                          LEFT.eclType,
                          Std.Str.ToUpperCase(RIGHT.best_attribute_type)
                      ),
                  SELF := LEFT
              ),
          LEFT OUTER
      );
  // Finalize the per-field information:
  //   - unnamed embedded records / child datasets get a generated type name
  //     (used for both the old and the best type)
  //   - every other original type name is uppercased
  //   - bestAssignment holds a TRANSFORM assignment with an explicit cast for
  //     fields whose old and best types need coercion, and is empty otherwise
  LOCAL __fieldInfo20 := PROJECT
      (
          __fieldInfo15,
          TRANSFORM
              (
                  {RECORDOF(LEFT), STRING bestAssignment},
                  isStructure := LEFT.isDataset OR LEFT.isRecord;
                  isAnonymous := isStructure AND LEFT.bestAttributeType IN ['<unnamed>', 'table of <unnamed>'];
                  generatedName := __DATAREC_NAME + '_' + INTFORMAT(COUNTER, 4, 1);
                  SELF.eclType := IF(isAnonymous, generatedName, Std.Str.ToUpperCase(LEFT.eclType)),
                  SELF.bestAttributeType := IF(isAnonymous, generatedName, LEFT.bestAttributeType),
                  SELF.bestAssignment := IF
                      (
                          __NeedCoercion(SELF.eclType, SELF.bestAttributeType),
                          ' SELF.' + LEFT.name + ' := (' + Std.Str.ToUppercase(SELF.bestAttributeType) + ')r.' + LEFT.name + ';',
                          ''
                      ),
                  SELF := LEFT
              )
      );
  // One line of a generated record definition: the text itself (field s,
  // inherited from __StringRec) plus the field's fully-qualified name and the
  // TRANSFORM assignment associated with it; both extras default to ''
  LOCAL __LayoutItems := RECORD(__StringRec)
      STRING fullName {DEFAULT('')};
      STRING bestAssignment {DEFAULT('')};
  END;
  // One complete generated record definition: its base name, the start/end
  // field positions and nesting depth of the structure it was extracted from,
  // and the lines that make up the definition
  LOCAL __ChildRecLayout := RECORD
      STRING layoutName;
      UNSIGNED2 startPos;
      UNSIGNED2 endPos;
      UNSIGNED2 depth;
      DATASET(__LayoutItems) items;
  END;
  // Builds the text lines of one RECORD definition from a set of fields.
  // Each field line also carries the TRANSFORM assignment that belongs to it,
  // so that a matching TRANSFORM can be generated later.
  //
  // @param   ds          The fields that make up the record
  // @param   layoutName  Base name of the record; prefixed with 'New' or 'Old'
  // @param   useBest     TRUE to declare fields with the recommended types,
  //                      FALSE to declare them with the original types
  // @return  An opening ':= RECORD' line, one line per field in position
  //          order, and a closing 'END;' line
  LOCAL __MakeRecDefinition(DATASET(RECORDOF(__fieldInfo20)) ds, STRING layoutName, BOOLEAN useBest = TRUE) := FUNCTION
      variantTag := IF(useBest, 'New', 'Old');
      openingLine := DATASET([{variantTag + layoutName + ' := RECORD'}], __LayoutItems);
      closingLine := DATASET([{'END;'}], __LayoutItems);
      fieldsInOrder := SORT(DISTRIBUTE(ds, 0), position, LOCAL);

      fieldLines := PROJECT
          (
              fieldsInOrder,
              TRANSFORM
                  (
                      __LayoutItems,
                      isStructure := LEFT.isDataset OR LEFT.isRecord;
                      baseType := IF(useBest, LEFT.bestAttributeType, LEFT.eclType);
                      // Structure types refer to another generated record,
                      // so they carry the same New/Old prefix
                      typeName := IF(isStructure, variantTag, '') + baseType;
                      declaredType := IF(LEFT.isDataset, 'DATASET(' + typeName + ')', typeName);
                      SELF.s := ' ' + declaredType + ' ' + LEFT.name + ';',
                      SELF.bestAssignment := MAP
                          (
                              LEFT.bestAssignment != ''   =>  LEFT.bestAssignment,
                              LEFT.isRecord               =>  ' SELF.' + LEFT.name + ' := ROW(Make_' + typeName + '(r.' + LEFT.name + '));',
                              LEFT.isDataset              =>  ' SELF.' + LEFT.name + ' := PROJECT(r.' + LEFT.name + ', Make_' + typeName + '(LEFT));',
                              ''
                          ),
                      SELF := LEFT
                  )
          );

      RETURN (+)
          (
              openingLine,
              fieldLines,
              closingLine,
              ORDERED(TRUE)
          );
  END;
  // LOOP body that extracts, for one nesting level, every embedded record
  // and child dataset into its own record definition and appends the results
  // to the definitions collected so far.
  //
  // @param   layoutDS    Definitions collected by previous iterations
  // @param   aDepth      Nesting level of the member fields to process; the
  //                      owning structures are the fields one level above
  // @param   useBest     TRUE to use the recommended types, FALSE to use the
  //                      original types
  // @return  layoutDS plus one definition per structure found at this level
  LOCAL __ProcessChildRecs(DATASET(__ChildRecLayout) layoutDS, UNSIGNED2 aDepth, BOOLEAN useBest = TRUE) := FUNCTION
      owningStructures := __fieldInfo20(depth = (aDepth - 1) AND (isRecord OR isDataset));
      memberFields := __fieldInfo20(depth = aDepth);

      // A member belongs to a structure when its position lies after the
      // structure's own position and no later than the structure's end
      extractedDefs := DENORMALIZE
          (
              owningStructures,
              memberFields,
              RIGHT.position BETWEEN LEFT.position + 1 AND LEFT.endPosition,
              GROUP,
              TRANSFORM
                  (
                      __ChildRecLayout,
                      SELF.layoutName := LEFT.bestAttributeType,
                      SELF.items := __MakeRecDefinition(ROWS(RIGHT), SELF.layoutName, useBest),
                      SELF.startPos := LEFT.position,
                      SELF.endPos := LEFT.endPosition,
                      SELF.depth := aDepth,
                      SELF := LEFT
                  ),
              ALL, ORDERED(TRUE)
          ) : ONWARNING(4531, IGNORE);

      RETURN layoutDS + extractedDefs;
  END;
  // Record definitions describing the input dataset as originally declared.
  // The LOOP runs once per nesting level and hands __ProcessChildRecs the
  // deepest level first (max depth, then max depth - 1, ...)
  LOCAL __oldNamedChildRecs0 := LOOP
      (
          DATASET([], __ChildRecLayout),
          MAX(__fieldInfo20, depth),
          __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, FALSE)
      );
  // Order the extracted definitions by end position, then by descending
  // start position
  LOCAL __oldNamedChildRecs := SORT(__oldNamedChildRecs0, endPos, -startPos);
  // The top-level definition is built from the depth-0 fields
  LOCAL __topLevelOldRecDef := DATASET
      (
          [{__LAYOUT_NAME, 0, 0, 0, __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, FALSE)}],
          __ChildRecLayout
      );
  // Child definitions first, top-level definition last
  LOCAL __allOldRecDefs := __oldNamedChildRecs & __topLevelOldRecDef;
  // Record definitions describing the input dataset using the recommended
  // ECL datatypes.  The LOOP runs once per nesting level and hands
  // __ProcessChildRecs the deepest level first (max depth, then
  // max depth - 1, ...)
  LOCAL __bestNamedChildRecs0 := LOOP
      (
          DATASET([], __ChildRecLayout),
          MAX(__fieldInfo20, depth),
          __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, TRUE)
      );
  // Order the extracted definitions by end position, then by descending
  // start position
  LOCAL __bestNamedChildRecs := SORT(__bestNamedChildRecs0, endPos, -startPos);
  // The top-level definition is built from the depth-0 fields
  LOCAL __topLevelBestRecDef := DATASET
      (
          [{__LAYOUT_NAME, 0, 0, 0, __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, TRUE)}],
          __ChildRecLayout
      );
  // Child definitions first, top-level definition last
  LOCAL __allBestRecDefs := __bestNamedChildRecs & __topLevelBestRecDef;
  // Generates the text of one TRANSFORM function that converts a row of the
  // 'Old' version of a record definition into the 'New' version.
  //
  // @param   recInfo     The record definition to generate a TRANSFORM for
  // @return  The TRANSFORM signature line, one line per field that has a
  //          non-empty assignment, a catch-all 'SELF := r;' line and a
  //          closing 'END;' line
  LOCAL __MakeTransforms(__ChildRecLayout recInfo) := FUNCTION
      signatureLine := DATASET(['New' + recInfo.layoutName + ' Make_New' + recInfo.layoutName + '(Old' + recInfo.layoutName + ' r) := TRANSFORM'], __StringRec);
      // Items without an assignment (e.g. the RECORD / END lines) are skipped
      assignmentLines := PROJECT
          (
              DISTRIBUTE(recInfo.items, 0),
              TRANSFORM
                  (
                      __StringRec,
                      SELF.s := IF(LEFT.bestAssignment != '', LEFT.bestAssignment, SKIP)
                  )
          );
      catchAllLine := DATASET([' SELF := r;'], __StringRec);
      closingLine := DATASET(['END;'], __StringRec);

      RETURN (+)
          (
              signatureLine,
              assignmentLines,
              catchAllLine,
              closingLine,
              ORDERED(TRUE)
          );
  END;
  // One row per 'best' record definition, each holding the lines of the
  // TRANSFORM generated for it
  LOCAL __allTransforms := PROJECT
      (
          __allBestRecDefs,
          TRANSFORM
              (
                  {DATASET(__StringRec) lines},
                  SELF.lines := __MakeTransforms(LEFT)
              )
      );
  // Assemble the optional part of the output: the original record
  // definitions, the TRANSFORMs that convert old to new, and a sample
  // dataset declaration plus PROJECT that ties them together.  Separator
  // and sample lines get a trailing HTML line break when textOutput is TRUE
  LOCAL __conditionalBR := #IF((BOOLEAN)textOutput) '<br/>' #ELSE '' #END;
  LOCAL __oldRecDefsPlusTransforms := (+)
      (
          DATASET(['//----------' + __conditionalBR], __StringRec),
          PROJECT(__allOldRecDefs.items, __StringRec),
          DATASET(['//----------' + __conditionalBR], __StringRec),
          __allTransforms.lines,
          DATASET(['//----------' + __conditionalBR], __StringRec),
          DATASET
              (
                  [
                      'oldDS := DATASET([], OldLayout);' + __conditionalBR,
                      'newDS := PROJECT(oldDS, Make_NewLayout(LEFT));' + __conditionalBR
                  ],
                  __StringRec
              ),
          ORDERED(TRUE)
      );
  // The optional part is included only when emitTransform is TRUE
  LOCAL __conditionalOldStuff :=
      #IF((BOOLEAN)emitTransform)
          __oldRecDefsPlusTransforms
      #ELSE
          DATASET([], __StringRec)
      #END;
  // The 'best' record definitions always come first
  LOCAL __allOutput := PROJECT(__allBestRecDefs.items, __StringRec) & __conditionalOldStuff;
  // Collapse all output lines into one string, joining them with HTML line
  // breaks; an 'END;' line is followed by one extra break so that
  // consecutive definitions are visually separated
  LOCAL __htmlString := ROLLUP
      (
          __allOutput,
          TRUE,
          TRANSFORM
              (
                  RECORDOF(LEFT),
                  blockSpacer := IF(RIGHT.s = 'END;', '<br/>', '');
                  SELF.s := LEFT.s + '<br/>' + RIGHT.s + blockSpacer
              )
      );
  // Wrap the collapsed text in <pre> and return it as a single record whose
  // only field is named result__html
  LOCAL __htmlResult := DATASET(['<pre>' + __htmlString[1].s + '</pre>'], {STRING result__html});
  // textOutput selects between the single HTML record and the line-per-record
  // dataset
  LOCAL __finalResult :=
      #IF((BOOLEAN)textOutput)
          __htmlResult
      #ELSE
          __allOutput
      #END;
  RETURN __finalResult;
  427. ENDMACRO;