BestRecordStructure.ecl 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. /***
  2. * Function macro that leverages DataPatterns to return a string defining the
  3. * best ECL record structure for the input data.
  4. *
  5. * @param inFile The dataset to process; REQUIRED
  6. * @param sampling A positive integer representing a percentage of
  7. * inFile to examine, which is useful when analyzing a
  8. * very large dataset and only an estimation is
  9. * sufficient; valid range for this argument is
  10. * 1-100; values outside of this range will be
  11. * clamped; OPTIONAL, defaults to 100 (which indicates
  12. * that the entire dataset will be analyzed)
  13. * @param emitTransform Boolean governing whether the function emits a
  14. * TRANSFORM function that could be used to rewrite
  15. * the dataset into the 'best' record definition;
  16. * OPTIONAL, defaults to FALSE.
  17. * @param textOutput Boolean governing the type of result that is
  18. * delivered by this function; if FALSE then a
  19. * recordset of STRINGs will be returned; if TRUE
  20. * then a dataset with a single STRING field, with
  21. * the contents formatted for HTML, will be
  22. * returned (this is the ideal output if the
  23. * intention is to copy the output from ECL Watch);
  24. * OPTIONAL, defaults to FALSE
  25. *
  26. * @return A recordset defining the best ECL record structure for the data.
  27. * If textOutput is FALSE (the default) then each record will contain
  28. * one field declaration, and the list of declarations will be wrapped
  29. * with RECORD and END strings; if the emitTransform argument was
  30. * TRUE, there will also be a set of records that comprise a
  31. * stand-alone TRANSFORM function. If textOutput is TRUE then only
  32. * one record will be returned, containing an HTML-formatted string
  33. * containing the new field declarations (and optionally the
  34. * TRANSFORM); this is the ideal format if the intention is to copy
  35. * the result from ECL Watch.
  36. */
  37. EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOutput = FALSE) := FUNCTIONMACRO
  // Template setup: #EXPORTXML stores the field metadata of RECORDOF(inFile)
  // under the template symbol bestInFileFields, and the #UNIQUENAME calls
  // declare the template symbols used by the #FOR loops further down.
  // NOTE(review): LOADXML('<xml/>') is presumably the usual dummy XML scope
  // needed before template statements can be used -- confirm
  38. LOADXML('<xml/>');
  39. #EXPORTXML(bestInFileFields, RECORDOF(inFile));
  40. #UNIQUENAME(bestFieldStack);
  41. #UNIQUENAME(bestStructType);
  42. #UNIQUENAME(bestLayoutType);
  43. #UNIQUENAME(bestCapturedPos);
  44. #UNIQUENAME(bestPrevCapturedPos);
  45. #UNIQUENAME(bestLayoutName);
  46. #UNIQUENAME(bestNeedsDelim);
  47. #UNIQUENAME(bestNamePrefix);
  48. #UNIQUENAME(recLevel);
  49. IMPORT DataPatterns;
  50. IMPORT Std;
  51. // Attribute naming note: In order to reduce symbol collisions with calling
  52. // code, all LOCAL attributes are prefixed with two underscore characters;
  53. // normally, a #UNIQUENAME would be used instead, but there is apparently
  54. // a problem with using that for ECL attributes when another function
  55. // macro is called (namely, Profile); using double underscores is not an
  56. // optimal solution but the chance of symbol collision should at least be
  57. // reduced
  // __DATAREC_NAME is the base of the names generated for unnamed embedded
  // records and child datasets; __LAYOUT_NAME is the name given to the
  // top-level layout; __StringRec is the one-field record used for every
  // line of generated ECL text
  58. LOCAL __DATAREC_NAME := 'DataRec';
  59. LOCAL __LAYOUT_NAME := 'Layout';
  60. LOCAL __StringRec := {STRING s};
  61. // Helper function for determining if old and new data types need
  62. // explicit type casting
  // Both type names are lower-cased and reduced, by prefix match, to one of
  // the generic categories 'string', 'data', 'boolean' or 'numeric'; a type
  // that matches no prefix is used unchanged. The result is TRUE when the
  // two generic categories differ.
  63. LOCAL __NeedCoercion(STRING oldType, STRING newType) := FUNCTION
  64. GenericType(STRING theType) := MAP
  65. (
  66. theType[..6] = 'string' => 'string',
  67. theType[..13] = 'ebcdic string' => 'string',
  68. theType[..7] = 'qstring' => 'string',
  69. theType[..9] = 'varstring' => 'string',
  70. theType[..3] = 'utf' => 'string',
  71. theType[..7] = 'unicode' => 'string',
  72. theType[..10] = 'varunicode' => 'string',
  73. theType[..4] = 'data' => 'data',
  74. theType[..7] = 'boolean' => 'boolean',
  75. theType[..7] = 'integer' => 'numeric',
  76. theType[..18] = 'big_endian integer' => 'numeric',
  77. theType[..4] = 'real' => 'numeric',
  78. theType[..7] = 'decimal' => 'numeric',
  79. theType[..8] = 'udecimal' => 'numeric',
  80. theType[..8] = 'unsigned' => 'numeric',
  81. theType[..19] = 'big_endian unsigned' => 'numeric',
  82. theType
  83. );
  84. oldGenericType := GenericType(Std.Str.ToLowerCase(oldType));
  85. newGenericType := GenericType(Std.Str.ToLowerCase(newType));
  86. RETURN oldGenericType != newGenericType;
  87. END;
  88. // Build a dataset containing information about embedded records and
  89. // child datasets; we need to track the beginning and ending positions
  90. // of the fields defined within those structures
  // Template symbols used while building __childRecInfo:
  //   bestFieldStack      - ';'-delimited stack of the structures currently
  //                         open; each entry is <r|d>:<position>:<ecltype>
  //                         ('r' for a record, 'd' for a dataset) and the
  //                         newest entry is first
  //   bestPrevCapturedPos - position of the most recent field entry that is
  //                         neither a structure start nor an end marker;
  //                         emitted as endPos
  //   bestNeedsDelim      - set to 1 once a row has been emitted so that
  //                         every later row is preceded by a comma
  // A row is emitted only when an end marker (@isEnd) is reached; at that
  // point the newest stack entry supplies layoutType, startPos and
  // layoutName, and is then popped from the stack
  91. LOCAL __ChildRecInfoLayout := RECORD
  92. STRING layoutType;
  93. STRING layoutName;
  94. STRING fieldName;
  95. UNSIGNED2 startPos;
  96. UNSIGNED2 endPos;
  97. END;
  98. LOCAL __childRecInfo := DATASET
  99. (
  100. [
  101. #SET(bestFieldStack, '')
  102. #SET(bestNeedsDelim, 0)
  103. #FOR(bestInFileFields)
  104. #FOR(Field)
  105. #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
  106. #IF(%{@isRecord}% = 1)
  107. #SET(bestStructType, 'r')
  108. #ELSE
  109. #SET(bestStructType, 'd')
  110. #END
  111. #IF(%'bestFieldStack'% != '')
  112. #SET(bestFieldStack, ';' + %'bestFieldStack'%)
  113. #END
  114. #SET(bestFieldStack, %'bestStructType'% + ':' + %'@position'% + ':' + %'@ecltype'% + %'bestFieldStack'%)
  115. #ELSEIF(%{@isEnd}% = 1)
  116. #SET(bestLayoutType, %'bestFieldStack'%[1])
  117. #SET(bestCapturedPos, REGEXFIND('.:(\\d+)', %'bestFieldStack'%, 1))
  118. #SET(bestLayoutName, REGEXFIND('.:\\d+:([^;]+)', %'bestFieldStack'%, 1))
  119. #SET(bestFieldStack, REGEXFIND('^[^;]+;(.*)', %'bestFieldStack'%, 1))
  120. #IF(%bestNeedsDelim% = 1) , #END
  121. {
  122. %'bestLayoutType'%,
  123. %'bestLayoutName'%,
  124. %'@name'%,
  125. %bestCapturedPos%,
  126. %bestPrevCapturedPos%
  127. }
  128. #SET(bestNeedsDelim, 1)
  129. #ELSE
  130. #SET(bestPrevCapturedPos, %@position%)
  131. #END
  132. #END
  133. #END
  134. ],
  135. __ChildRecInfoLayout
  136. );
  137. // Extract the original data type and position of the fields within the
  138. // input dataset
  // One row is emitted for every field entry except end markers.
  // bestNamePrefix accumulates the dotted path of the enclosing structures:
  // a name is appended after each record/dataset entry and the last name is
  // removed at each end marker, so fullName is the prefix plus the field
  // name. recLevel tracks the nesting depth in the same way and is stored
  // as depth.
  139. LOCAL __FieldInfoLayout := RECORD
  140. STRING eclType;
  141. STRING name;
  142. STRING fullName;
  143. BOOLEAN isRecord;
  144. BOOLEAN isDataset;
  145. UNSIGNED2 depth;
  146. UNSIGNED2 position;
  147. END;
  148. LOCAL __fieldInfo0 := DATASET
  149. (
  150. [
  151. #SET(bestFieldStack, '')
  152. #SET(bestNeedsDelim, 0)
  153. #SET(bestNamePrefix, '')
  154. #SET(recLevel, 0)
  155. #FOR(bestInFileFields)
  156. #FOR(Field)
  157. #IF(%@isEnd% != 1)
  158. #IF(%bestNeedsDelim% = 1) , #END
  159. {
  160. %'@ecltype'%,
  161. %'@name'%,
  162. %'bestNamePrefix'% + %'@name'%,
  163. #IF(%@isRecord% = 1) TRUE #ELSE FALSE #END,
  164. #IF(%@isDataset% = 1) TRUE #ELSE FALSE #END,
  165. %recLevel%,
  166. %@position%
  167. }
  168. #SET(bestNeedsDelim, 1)
  169. #END
  170. #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
  171. #APPEND(bestNamePrefix, %'@name'% + '.')
  172. #SET(recLevel, %recLevel% + 1)
  173. #ELSEIF(%{@isEnd}% = 1)
  174. #SET(bestNamePrefix, REGEXREPLACE('\\w+\\.$', %'bestNamePrefix'%, ''))
  175. #SET(recLevel, %recLevel% - 1)
  176. #END
  177. #END
  178. #END
  179. ],
  180. __FieldInfoLayout
  181. );
  182. // Attach the record end positions for embedded records and child datasets
  // The match is on both field name and start position; the join is
  // LEFT OUTER so every row of __fieldInfo0 is kept, including fields that
  // have no entry in __childRecInfo
  183. LOCAL __fieldInfo10 := JOIN
  184. (
  185. __fieldInfo0,
  186. __childRecInfo,
  187. LEFT.name = RIGHT.fieldName AND LEFT.position = RIGHT.startPos,
  188. TRANSFORM
  189. (
  190. {
  191. RECORDOF(LEFT),
  192. UNSIGNED2 endPosition
  193. },
  194. SELF.endPosition := RIGHT.endPos,
  195. SELF := LEFT
  196. ),
  197. LEFT OUTER
  198. );
  199. // Get the best data types from the Profile() function
  // Only the 'best_ecl_types' feature is requested, and the caller's
  // sampling argument is passed through as Profile's sampleSize
  200. LOCAL __patternRes := DataPatterns.Profile(inFile, features := 'best_ecl_types', sampleSize := sampling);
  201. // Append the derived 'best' data types to the field information we
  202. // already collected
  // Rows are matched on the dotted field name; when Profile supplied no
  // best type for a field, the original ECL type is used instead
  203. LOCAL __fieldInfo15 := JOIN
  204. (
  205. __fieldInfo10,
  206. __patternRes,
  207. LEFT.fullName = RIGHT.attribute,
  208. TRANSFORM
  209. (
  210. {
  211. RECORDOF(LEFT),
  212. STRING bestAttributeType
  213. },
  214. SELF.bestAttributeType := IF(RIGHT.best_attribute_type != '', Std.Str.ToUpperCase(RIGHT.best_attribute_type), LEFT.eclType),
  215. SELF := LEFT
  216. ),
  217. LEFT OUTER
  218. );
  219. // Determine fields that must have explicit coercion if we are supplying
  220. // transform information
  // Records and datasets whose type is '<unnamed>' or 'table of <unnamed>'
  // are given a generated type name (__DATAREC_NAME, an underscore and the
  // row counter formatted by INTFORMAT) in both eclType and
  // bestAttributeType; all other rows keep their types, with eclType
  // upper-cased. bestAssignment holds an assignment statement with an
  // explicit cast when __NeedCoercion reports that the old and new types
  // differ in category, and is empty otherwise.
  221. LOCAL __fieldInfo20 := PROJECT
  222. (
  223. __fieldInfo15,
  224. TRANSFORM
  225. (
  226. {
  227. RECORDOF(LEFT),
  228. STRING bestAssignment
  229. },
  230. shouldRewriteType := ((LEFT.isDataset OR LEFT.isRecord) AND LEFT.bestAttributeType IN ['<unnamed>', 'table of <unnamed>']);
  231. tempDSName := __DATAREC_NAME + '_' + INTFORMAT(COUNTER, 4, 1);
  232. SELF.eclType := IF(NOT shouldRewriteType, Std.Str.ToUpperCase(LEFT.eclType), tempDSName),
  233. SELF.bestAttributeType := IF(NOT shouldRewriteType, LEFT.bestAttributeType, tempDSName),
  234. SELF.bestAssignment := IF
  235. (
  236. __NeedCoercion(SELF.eclType, SELF.bestAttributeType),
  237. ' SELF.' + LEFT.name + ' := (' + Std.Str.ToUppercase(SELF.bestAttributeType) + ')r.' + LEFT.name + ';',
  238. ''
  239. ),
  240. SELF := LEFT
  241. )
  242. );
  // __LayoutItems is one line of generated text (s) plus the field's dotted
  // name and its TRANSFORM assignment statement, both defaulting to ''.
  // __ChildRecLayout describes one generated record definition: its name,
  // the start/end positions and depth of the structure it came from, and
  // the text lines that make up the definition.
  243. LOCAL __LayoutItems := RECORD(__StringRec)
  244. STRING fullName {DEFAULT('')};
  245. STRING bestAssignment {DEFAULT('')};
  246. END;
  247. LOCAL __ChildRecLayout := RECORD
  248. STRING layoutName;
  249. UNSIGNED2 startPos;
  250. UNSIGNED2 endPos;
  251. UNSIGNED2 depth;
  252. DATASET(__LayoutItems) items;
  253. END;
  254. // Function for creating the lines of an ECL RECORD definition
  // The result is a '<prefix><layoutName> := RECORD' line, one declaration
  // line per field of ds in position order, and an 'END;' line, where the
  // prefix is 'New' when useBest is TRUE and 'Old' otherwise. With useBest
  // the declared type is bestAttributeType, otherwise eclType. Record and
  // dataset fields have the prefix added to their type name, and dataset
  // fields are wrapped in DATASET(). Each field line also carries the
  // assignment statement to use in a TRANSFORM: the cast already computed
  // for the field, or a ROW()/PROJECT() call on the matching Make_ function
  // for records/datasets, or '' when no explicit assignment is needed.
  255. LOCAL __MakeRecDefinition(DATASET(RECORDOF(__fieldInfo20)) ds, STRING layoutName, BOOLEAN useBest = TRUE) := FUNCTION
  256. displayPrefix := IF(useBest, 'New', 'Old');
  257. displayedLayoutName := displayPrefix + layoutName;
  258. RETURN (+)
  259. (
  260. DATASET([{displayedLayoutName + ' := RECORD'}], __LayoutItems),
  261. PROJECT
  262. (
  263. SORT(DISTRIBUTE(ds, 0), position, LOCAL),
  264. TRANSFORM
  265. (
  266. __LayoutItems,
  267. attrType := IF(useBest, LEFT.bestAttributeType, LEFT.eclType);
  268. attrPrefix := IF(LEFT.isDataset OR LEFT.isRecord, displayPrefix, '');
  269. fullAttrType := attrPrefix + attrType;
  270. namedDataType := IF(NOT LEFT.isDataset, fullAttrType, 'DATASET(' + fullAttrType + ')');
  271. SELF.s := ' ' + namedDataType + ' ' + LEFT.name + ';',
  272. SELF.bestAssignment := MAP
  273. (
  274. LEFT.bestAssignment != '' => LEFT.bestAssignment,
  275. LEFT.isRecord => ' SELF.' + LEFT.name + ' := ROW(Make_' + fullAttrType + '(r.' + LEFT.name + '));',
  276. LEFT.isDataset => ' SELF.' + LEFT.name + ' := PROJECT(r.' + LEFT.name + ', Make_' + fullAttrType + '(LEFT));',
  277. ''
  278. ),
  279. SELF := LEFT
  280. )
  281. ),
  282. DATASET([{'END;'}], __LayoutItems),
  283. ORDERED(TRUE)
  284. );
  285. END;
  286. // Iteratively process embedded records and child dataset definitions,
  287. // extracting each into its own record
  // For one nesting level: every record/dataset field at depth aDepth - 1
  // is paired with the fields at depth aDepth whose positions fall between
  // the parent's position + 1 and its endPosition, and the group is turned
  // into a record definition named after the parent's bestAttributeType.
  // The new definitions are appended to layoutDS.
  // NOTE(review): ONWARNING(4531, IGNORE) suppresses warning 4531; the
  // meaning of that warning is not shown in this file -- confirm
  288. LOCAL __ProcessChildRecs(DATASET(__ChildRecLayout) layoutDS, UNSIGNED2 aDepth, BOOLEAN useBest = TRUE) := FUNCTION
  289. __bestNamedChildRecs := DENORMALIZE
  290. (
  291. __fieldInfo20(depth = (aDepth - 1) AND (isRecord OR isDataset)),
  292. __fieldInfo20(depth = aDepth),
  293. RIGHT.position BETWEEN LEFT.position + 1 AND LEFT.endPosition,
  294. GROUP,
  295. TRANSFORM
  296. (
  297. __ChildRecLayout,
  298. SELF.layoutName := LEFT.bestAttributeType,
  299. SELF.items := __MakeRecDefinition(ROWS(RIGHT), SELF.layoutName, useBest),
  300. SELF.startPos := LEFT.position,
  301. SELF.endPos := LEFT.endPosition,
  302. SELF.depth := aDepth,
  303. SELF := LEFT
  304. ),
  305. ALL, ORDERED(TRUE)
  306. ) : ONWARNING(4531, IGNORE);
  307. RETURN layoutDS + __bestNamedChildRecs;
  308. END;
  309. // Create a list of embedded records and child dataset definitions for the
  310. // original input dataset
  // The LOOP runs once per nesting level, starting with the deepest level
  // (MAX depth + 1 - COUNTER) and passing useBest = FALSE so the original
  // types are used; the top-level definition, named __LAYOUT_NAME, is
  // appended after the sorted child definitions
  311. LOCAL __oldNamedChildRecs0 := LOOP
  312. (
  313. DATASET([], __ChildRecLayout),
  314. MAX(__fieldInfo20, depth),
  315. __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, FALSE)
  316. );
  317. LOCAL __oldNamedChildRecs := SORT(__oldNamedChildRecs0, endPos, -startPos);
  318. LOCAL __topLevelOldRecDef := DATASET
  319. (
  320. [
  321. {
  322. __LAYOUT_NAME,
  323. 0,
  324. 0,
  325. 0,
  326. __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, FALSE)
  327. }
  328. ],
  329. __ChildRecLayout
  330. );
  331. LOCAL __allOldRecDefs := __oldNamedChildRecs & __topLevelOldRecDef;
  332. // Create a list of embedded records and child dataset definitions using the
  333. // recommended ECL datatypes
  // Same construction as for the original definitions above, but with
  // useBest = TRUE
  334. LOCAL __bestNamedChildRecs0 := LOOP
  335. (
  336. DATASET([], __ChildRecLayout),
  337. MAX(__fieldInfo20, depth),
  338. __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, TRUE)
  339. );
  340. LOCAL __bestNamedChildRecs := SORT(__bestNamedChildRecs0, endPos, -startPos);
  341. LOCAL __topLevelBestRecDef := DATASET
  342. (
  343. [
  344. {
  345. __LAYOUT_NAME,
  346. 0,
  347. 0,
  348. 0,
  349. __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, TRUE)
  350. }
  351. ],
  352. __ChildRecLayout
  353. );
  354. LOCAL __allBestRecDefs := __bestNamedChildRecs & __topLevelBestRecDef;
  355. // Creates an ECL TRANSFORM function based on the collected information
  356. // about a record definition
  // The generated text is a header line declaring Make_New<layoutName>
  // with an Old<layoutName> parameter named r, then every non-empty
  // assignment statement carried by the definition's items (empty ones are
  // skipped), then a catch-all 'SELF := r;' line and 'END;'
  357. LOCAL __MakeTransforms(__ChildRecLayout recInfo) := FUNCTION
  358. RETURN (+)
  359. (
  360. DATASET(['New' + recInfo.layoutName + ' Make_New' + recInfo.layoutName + '(Old' + recInfo.layoutName + ' r) := TRANSFORM'], __StringRec),
  361. PROJECT
  362. (
  363. DISTRIBUTE(recInfo.items, 0),
  364. TRANSFORM
  365. (
  366. __StringRec,
  367. assignment := LEFT.bestAssignment;
  368. SELF.s := IF(assignment != '', assignment, SKIP)
  369. )
  370. ),
  371. DATASET([' SELF := r;'], __StringRec),
  372. DATASET(['END;'], __StringRec),
  373. ORDERED(TRUE)
  374. );
  375. END;
  // One generated TRANSFORM function (as a child dataset of text lines) for
  // each of the 'best' record definitions
  376. LOCAL __allTransforms := PROJECT
  377. (
  378. __allBestRecDefs,
  379. TRANSFORM
  380. (
  381. {
  382. DATASET(__StringRec) lines
  383. },
  384. SELF.lines := __MakeTransforms(LEFT)
  385. )
  386. );
  387. // Create a dataset of STRINGS that contain record definitions for the
  388. // input dataset, TRANSFORMs for converting between the old and new
  389. // definitions, and a sample PROJECT for kicking it all off
  // __conditionalBR is '<br/>' when textOutput is TRUE and '' otherwise; it
  // is appended to the separator and sample lines built here
  390. LOCAL __conditionalBR := #IF((BOOLEAN)textOutput) '<br/>' #ELSE '' #END;
  391. LOCAL __oldRecDefsPlusTransforms := (+)
  392. (
  393. DATASET(['//----------' + __conditionalBR], __StringRec),
  394. PROJECT(__allOldRecDefs.items, __StringRec),
  395. DATASET(['//----------' + __conditionalBR], __StringRec),
  396. __allTransforms.lines,
  397. DATASET(['//----------' + __conditionalBR], __StringRec),
  398. DATASET(['oldDS := DATASET([], OldLayout);' + __conditionalBR], __StringRec),
  399. DATASET(['newDS := PROJECT(oldDS, Make_NewLayout(LEFT));' + __conditionalBR], __StringRec),
  400. ORDERED(TRUE)
  401. );
  402. // Combine old definitions and transforms conditionally
  // The old definitions and TRANSFORMs are included only when emitTransform
  // is TRUE; the 'best' record definitions always come first in the output
  403. LOCAL __conditionalOldStuff :=
  404. #IF((BOOLEAN)emitTransform)
  405. __oldRecDefsPlusTransforms
  406. #ELSE
  407. DATASET([], __StringRec)
  408. #END;
  409. LOCAL __allOutput := PROJECT(__allBestRecDefs.items, __StringRec) & __conditionalOldStuff;
  410. // Roll everything up to one string with HTML line breaks
  // Every line is joined to the previous text with '<br/>'; a line that is
  // exactly 'END;' gets one more '<br/>' after it
  411. LOCAL __htmlString := ROLLUP
  412. (
  413. __allOutput,
  414. TRUE,
  415. TRANSFORM
  416. (
  417. RECORDOF(LEFT),
  418. rightString := IF(RIGHT.s = 'END;', RIGHT.s + '<br/>', RIGHT.s);
  419. SELF.s := LEFT.s + '<br/>' + rightString
  420. )
  421. );
  422. // Stuff the HTML result into a single record, wrapped with <pre> so it
  423. // looks right in the browser
  424. LOCAL __htmlResult := DATASET(['<pre>' + __htmlString[1].s + '</pre>'], {STRING result__html});
  425. // Choose the result (dataset with each line a string, or a text blob)
  426. LOCAL __finalResult := #IF((BOOLEAN)textOutput) __htmlResult #ELSE __allOutput #END;
  427. RETURN __finalResult;
  428. ENDMACRO;