BestRecordStructure.ecl 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. /***
  2. * Function macro that leverages DataPatterns to return a string defining the
  3. * best ECL record structure for the input data.
  4. *
  5. * @param inFile The dataset to process; REQUIRED
  6. * @param sampling A positive integer representing a percentage of
  7. * inFile to examine, which is useful when analyzing a
  8. * very large dataset and only an estimate is
  9. * sufficient; valid range for this argument is
  10. * 1-100; values outside of this range will be
  11. * clamped; OPTIONAL, defaults to 100 (which indicates
  12. * that the entire dataset will be analyzed)
  13. * @param emitTransform Boolean governing whether the function emits a
  14. * TRANSFORM function that could be used to rewrite
  15. * the dataset into the 'best' record definition;
  16. * OPTIONAL, defaults to FALSE.
  17. * @param textOutput Boolean governing the type of result that is
  18. * delivered by this function; if FALSE then a
  19. * recordset of STRINGs will be returned; if TRUE
  20. * then a dataset with a single STRING field, with
  21. * the contents formatted for HTML, will be
  22. * returned (this is the ideal output if the
  23. * intention is to copy the output from ECL Watch);
  24. * OPTIONAL, defaults to FALSE
  25. *
  26. * @return A recordset defining the best ECL record structure for the data.
  27. * If textOutput is FALSE (the default) then each record will contain
  28. * one field declaration, and the list of declarations will be wrapped
  29. * with RECORD and END strings; if the emitTransform argument was
  30. * TRUE, there will also be a set of records that comprise a
  31. * stand-alone TRANSFORM function. If textOutput is TRUE then only
  32. * one record will be returned, containing an HTML-formatted string
  33. * containing the new field declarations (and optionally the
  34. * TRANSFORM); this is the ideal format if the intention is to copy
  35. * the result from ECL Watch.
  36. */
  EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOutput = FALSE) := FUNCTIONMACRO
      // Open an XML scope for the template language, then export the input's
      // record structure so the template statements below can walk it one
      // Field element at a time
      LOADXML('<xml/>');
      #EXPORTXML(bestInFileFields, RECORDOF(inFile));

      // Template symbols used while walking the exported field list
      #UNIQUENAME(bestFieldStack);        // ';'-delimited stack of open records/datasets, newest first
      #UNIQUENAME(bestStructType);        // 'r' (embedded record) or 'd' (child dataset)
      #UNIQUENAME(bestLayoutType);        // Struct type popped from the stack at an end marker
      #UNIQUENAME(bestCapturedPos);       // Start position popped from the stack at an end marker
      #UNIQUENAME(bestPrevCapturedPos);   // Position of the most recently seen plain field
      #UNIQUENAME(bestLayoutName);        // ECL type name popped from the stack at an end marker
      #UNIQUENAME(bestNeedsDelim);        // 1 once a row has been emitted (a comma is then needed)
      #UNIQUENAME(bestNamePrefix);        // Dotted path of enclosing records/datasets
      #UNIQUENAME(recLevel);              // Current nesting depth

      IMPORT DataPatterns;
      IMPORT Std;

      // Attribute naming note:  In order to reduce symbol collisions with calling
      // code, all LOCAL attributes are prefixed with two underscore characters;
      // normally, a #UNIQUENAME would be used instead, but there is apparently
      // a problem with using that for ECL attributes when another function
      // macro is called (namely, Profile); using double underscores is not an
      // optimal solution but the chance of symbol collision should at least be
      // reduced

      LOCAL __DATAREC_NAME := 'DataRec';
      LOCAL __LAYOUT_NAME := 'Layout';

      LOCAL __StringRec := {STRING s};

      // Helper function for determining if old and new data types need
      // explicit type casting; both type names are lowercased and reduced to a
      // generic family ('string', 'data', 'boolean', 'numeric', or the type
      // name itself when no prefix matches), and coercion is needed only
      // when the two families differ
      LOCAL __NeedCoercion(STRING oldType, STRING newType) := FUNCTION
          GenericType(STRING theType) := MAP
              (
                  theType[..6] = 'string'                 =>  'string',
                  theType[..13] = 'ebcdic string'         =>  'string',
                  theType[..7] = 'qstring'                =>  'string',
                  theType[..9] = 'varstring'              =>  'string',
                  theType[..3] = 'utf'                    =>  'string',
                  theType[..7] = 'unicode'                =>  'string',
                  theType[..10] = 'varunicode'            =>  'string',
                  theType[..4] = 'data'                   =>  'data',
                  theType[..7] = 'boolean'                =>  'boolean',
                  theType[..7] = 'integer'                =>  'numeric',
                  theType[..18] = 'big_endian integer'    =>  'numeric',
                  theType[..4] = 'real'                   =>  'numeric',
                  theType[..7] = 'decimal'                =>  'numeric',
                  theType[..8] = 'udecimal'               =>  'numeric',
                  theType[..8] = 'unsigned'               =>  'numeric',
                  theType[..19] = 'big_endian unsigned'   =>  'numeric',
                  theType
              );

          oldGenericType := GenericType(Std.Str.ToLowerCase(oldType));
          newGenericType := GenericType(Std.Str.ToLowerCase(newType));

          RETURN oldGenericType != newGenericType;
      END;

      // Build a dataset containing information about embedded records and
      // child datasets; we need to track the beginning and ending positions
      // of the fields defined within those structures
      LOCAL __ChildRecInfoLayout := RECORD
          STRING      layoutType;
          STRING      layoutName;
          STRING      fieldName;
          UNSIGNED2   startPos;
          UNSIGNED2   endPos;
      END;

      // One row is emitted per end marker.  Opening a record or dataset pushes
      // '<r|d>:<position>:<ecltype>' onto bestFieldStack; the matching end
      // marker pops that entry and pairs its position (startPos) with the
      // position of the last plain field seen (endPos).
      LOCAL __childRecInfo := DATASET
          (
              [
                  #SET(bestFieldStack, '')
                  #SET(bestNeedsDelim, 0)
                  #FOR(bestInFileFields)
                      #FOR(Field)
                          #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                              #IF(%{@isRecord}% = 1)
                                  #SET(bestStructType, 'r')
                              #ELSE
                                  #SET(bestStructType, 'd')
                              #END
                              #IF(%'bestFieldStack'% != '')
                                  #SET(bestFieldStack, ';' + %'bestFieldStack'%)
                              #END
                              #SET(bestFieldStack, %'bestStructType'% + ':' + %'@position'% + ':' + %'@ecltype'% + %'bestFieldStack'%)
                          #ELSEIF(%{@isEnd}% = 1)
                              #SET(bestLayoutType, %'bestFieldStack'%[1])
                              #SET(bestCapturedPos, REGEXFIND('.:(\\d+)', %'bestFieldStack'%, 1))
                              #SET(bestLayoutName, REGEXFIND('.:\\d+:([^;]+)', %'bestFieldStack'%, 1))
                              #SET(bestFieldStack, REGEXFIND('^[^;]+;(.*)', %'bestFieldStack'%, 1))

                              #IF(%bestNeedsDelim% = 1) , #END

                              {
                                  %'bestLayoutType'%,
                                  %'bestLayoutName'%,
                                  %'@name'%,
                                  %bestCapturedPos%,
                                  %bestPrevCapturedPos%
                              }

                              #SET(bestNeedsDelim, 1)
                          #ELSE
                              #SET(bestPrevCapturedPos, %@position%)
                          #END
                      #END
                  #END
              ],
              __ChildRecInfoLayout
          );

      // Extract the original data type and position of the fields within the
      // input dataset
      LOCAL __FieldInfoLayout := RECORD
          STRING      eclType;
          STRING      name;
          STRING      fullName;
          BOOLEAN     isRecord;
          BOOLEAN     isDataset;
          UNSIGNED2   depth;
          UNSIGNED2   position;
      END;

      // One row per Field element that is not an end marker; fullName is the
      // field name prefixed with the dotted names of its enclosing records
      // and datasets, and depth is the nesting level (0 = top level)
      LOCAL __fieldInfo0 := DATASET
          (
              [
                  #SET(bestFieldStack, '')
                  #SET(bestNeedsDelim, 0)
                  #SET(bestNamePrefix, '')
                  #SET(recLevel, 0)
                  #FOR(bestInFileFields)
                      #FOR(Field)
                          #IF(%@isEnd% != 1)
                              #IF(%bestNeedsDelim% = 1) , #END

                              {
                                  %'@ecltype'%,
                                  %'@name'%,
                                  %'bestNamePrefix'% + %'@name'%,
                                  #IF(%@isRecord% = 1) TRUE #ELSE FALSE #END,
                                  #IF(%@isDataset% = 1) TRUE #ELSE FALSE #END,
                                  %recLevel%,
                                  %@position%
                              }

                              #SET(bestNeedsDelim, 1)
                          #END

                          #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                              #APPEND(bestNamePrefix, %'@name'% + '.')
                              #SET(recLevel, %recLevel% + 1)
                          #ELSEIF(%{@isEnd}% = 1)
                              #SET(bestNamePrefix, REGEXREPLACE('\\w+\\.$', %'bestNamePrefix'%, ''))
                              #SET(recLevel, %recLevel% - 1)
                          #END
                      #END
                  #END
              ],
              __FieldInfoLayout
          );

      // Attach the record end positions for embedded records and child datasets
      LOCAL __fieldInfo10 := JOIN
          (
              __fieldInfo0,
              __childRecInfo,
              LEFT.name = RIGHT.fieldName AND LEFT.position = RIGHT.startPos,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          UNSIGNED2   endPosition
                      },
                      SELF.endPosition := RIGHT.endPos,
                      SELF := LEFT
                  ),
              LEFT OUTER
          );

      // Get the best data types from the Profile() function
      LOCAL __patternRes := DataPatterns.Profile(inFile, features := 'best_ecl_types', sampleSize := sampling);

      // Append the derived 'best' data types to the field information we
      // already collected; fields without a recommendation keep their
      // original ECL type
      LOCAL __fieldInfo15 := JOIN
          (
              __fieldInfo10,
              __patternRes,
              LEFT.fullName = RIGHT.attribute,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          STRING      bestAttributeType
                      },
                      SELF.bestAttributeType := IF(RIGHT.best_attribute_type != '', Std.Str.ToUpperCase(RIGHT.best_attribute_type), LEFT.eclType),
                      SELF := LEFT
                  ),
              LEFT OUTER
          );

      // Determine fields that must have explicit coercion if we are supplying
      // transform information; unnamed embedded records and child datasets
      // are given a generated layout name (DataRec_NNNN, numbered by COUNTER)
      LOCAL __fieldInfo20 := PROJECT
          (
              __fieldInfo15,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          STRING      bestAssignment
                      },
                      shouldRewriteType := ((LEFT.isDataset OR LEFT.isRecord) AND LEFT.bestAttributeType IN ['<unnamed>', 'table of <unnamed>']);
                      tempDSName := __DATAREC_NAME + '_' + INTFORMAT(COUNTER, 4, 1);
                      SELF.eclType := IF(NOT shouldRewriteType, Std.Str.ToUpperCase(LEFT.eclType), tempDSName),
                      SELF.bestAttributeType := IF(NOT shouldRewriteType, LEFT.bestAttributeType, tempDSName),
                      SELF.bestAssignment := IF
                          (
                              __NeedCoercion(SELF.eclType, SELF.bestAttributeType),
                              ' SELF.' + LEFT.name + ' := (' + Std.Str.ToUppercase(SELF.bestAttributeType) + ')r.' + LEFT.name + ';',
                              ''
                          ),
                      SELF := LEFT
                  )
          );

      // One output line (s) plus the data needed to later build a TRANSFORM
      LOCAL __LayoutItems := RECORD(__StringRec)
          STRING                  fullName {DEFAULT('')};
          STRING                  bestAssignment {DEFAULT('')};
      END;

      // One record definition (top-level or nested) and its generated lines
      LOCAL __ChildRecLayout := RECORD
          STRING                  layoutName;
          UNSIGNED2               startPos;
          UNSIGNED2               endPos;
          UNSIGNED2               depth;
          DATASET(__LayoutItems)  items;
      END;

      // Function for creating the lines of one ECL RECORD definition:  a
      // '<New|Old><layoutName> := RECORD' header, one declaration per field
      // in ds (ordered by position) and a closing 'END;'.  Each field row
      // also carries the TRANSFORM assignment text for that field, if any:
      // an explicit cast, a ROW() call for embedded records, or a PROJECT()
      // for child datasets.
      LOCAL __MakeRecDefinition(DATASET(RECORDOF(__fieldInfo20)) ds, STRING layoutName, BOOLEAN useBest = TRUE) := FUNCTION
          displayPrefix := IF(useBest, 'New', 'Old');
          displayedLayoutName := displayPrefix + layoutName;
          RETURN DATASET([{displayedLayoutName + ' := RECORD'}], __LayoutItems)
              & PROJECT
                  (
                      // Move the rows to the first node and only then sort them
                      // locally; DISTRIBUTE (without its MERGE option) does not
                      // maintain an existing sort order, so sorting before
                      // distributing could emit the declarations out of order
                      SORT(DISTRIBUTE(ds, 0), position, LOCAL),
                      TRANSFORM
                          (
                              __LayoutItems,
                              attrType := IF(useBest, LEFT.bestAttributeType, LEFT.eclType);
                              attrPrefix := IF(LEFT.isDataset OR LEFT.isRecord, displayPrefix, '');
                              fullAttrType := attrPrefix + attrType;
                              namedDataType := IF(NOT LEFT.isDataset, fullAttrType, 'DATASET(' + fullAttrType + ')');
                              SELF.s := ' ' + namedDataType + ' ' + LEFT.name + ';',
                              SELF.bestAssignment := MAP
                                  (
                                      LEFT.bestAssignment != ''   =>  LEFT.bestAssignment,
                                      LEFT.isRecord               =>  ' SELF.' + LEFT.name + ' := ROW(Make_' + fullAttrType + '(r.' + LEFT.name + '));',
                                      LEFT.isDataset              =>  ' SELF.' + LEFT.name + ' := PROJECT(r.' + LEFT.name + ', Make_' + fullAttrType + '(LEFT));',
                                      ''
                                  ),
                              SELF := LEFT
                          )
                  )
              & DATASET([{'END;'}], __LayoutItems);
      END;

      // Iteratively process embedded records and child dataset definitions,
      // extracting each into its own record; one call handles a single depth:
      // every record/dataset field at (aDepth - 1) is matched with the fields
      // at aDepth that lie within its position range, and the resulting
      // definitions are appended to layoutDS
      LOCAL __ProcessChildRecs(DATASET(__ChildRecLayout) layoutDS, UNSIGNED2 aDepth, BOOLEAN useBest = TRUE) := FUNCTION
          // NOTE(review): warning 4531 is suppressed here; presumably it is
          // raised by the ALL join used in this DENORMALIZE -- confirm
          __bestNamedChildRecs := DENORMALIZE
              (
                  __fieldInfo20(depth = (aDepth - 1) AND (isRecord OR isDataset)),
                  __fieldInfo20(depth = aDepth),
                  RIGHT.position BETWEEN LEFT.position + 1 AND LEFT.endPosition,
                  GROUP,
                  TRANSFORM
                      (
                          __ChildRecLayout,
                          SELF.layoutName := LEFT.bestAttributeType,
                          SELF.items := __MakeRecDefinition(ROWS(RIGHT), SELF.layoutName, useBest),
                          SELF.startPos := LEFT.position,
                          SELF.endPos := LEFT.endPosition,
                          SELF.depth := aDepth,
                          SELF := LEFT
                      ),
                  ALL, ORDERED(TRUE)
              ) : ONWARNING(4531, IGNORE);

          RETURN layoutDS + __bestNamedChildRecs;
      END;

      // Create a list of embedded records and child dataset definitions for the
      // original input dataset; the loop runs once per nesting level, from
      // the deepest level up to level 1
      LOCAL __oldNamedChildRecs0 := LOOP
          (
              DATASET([], __ChildRecLayout),
              MAX(__fieldInfo20, depth),
              __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, FALSE)
          );

      LOCAL __oldNamedChildRecs := SORT(__oldNamedChildRecs0, endPos, -startPos);

      LOCAL __topLevelOldRecDef := DATASET
          (
              [
                  {
                      __LAYOUT_NAME,
                      0,
                      0,
                      0,
                      __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, FALSE)
                  }
              ],
              __ChildRecLayout
          );

      // Child definitions come first, followed by the top-level layout
      LOCAL __allOldRecDefs := __oldNamedChildRecs & __topLevelOldRecDef;

      // Create a list of embedded records and child dataset definitions using
      // the recommended ECL datatypes
      LOCAL __bestNamedChildRecs0 := LOOP
          (
              DATASET([], __ChildRecLayout),
              MAX(__fieldInfo20, depth),
              __ProcessChildRecs(ROWS(LEFT), MAX(__fieldInfo20, depth) + 1 - COUNTER, TRUE)
          );

      LOCAL __bestNamedChildRecs := SORT(__bestNamedChildRecs0, endPos, -startPos);

      LOCAL __topLevelBestRecDef := DATASET
          (
              [
                  {
                      __LAYOUT_NAME,
                      0,
                      0,
                      0,
                      __MakeRecDefinition(__fieldInfo20(depth = 0), __LAYOUT_NAME, TRUE)
                  }
              ],
              __ChildRecLayout
          );

      // Child definitions come first, followed by the top-level layout
      LOCAL __allBestRecDefs := __bestNamedChildRecs & __topLevelBestRecDef;

      // Creates an ECL TRANSFORM function based on the collected information
      // about a record definition; only items that carry an assignment
      // produce a line (the others are skipped), and the final 'SELF := r;'
      // copies every remaining field
      LOCAL __MakeTransforms(__ChildRecLayout recInfo) := FUNCTION
          RETURN DATASET(['New' + recInfo.layoutName + ' Make_New' + recInfo.layoutName + '(Old' + recInfo.layoutName + ' r) := TRANSFORM'], __StringRec)
              & PROJECT
                  (
                      DISTRIBUTE(recInfo.items, 0),
                      TRANSFORM
                          (
                              __StringRec,
                              assignment := LEFT.bestAssignment;
                              SELF.s := IF(assignment != '', assignment, SKIP)
                          )
                  )
              & DATASET([' SELF := r;'], __StringRec)
              & DATASET(['END;'], __StringRec);
      END;

      LOCAL __allTransforms := PROJECT
          (
              __allBestRecDefs,
              TRANSFORM
                  (
                      {
                          DATASET(__StringRec)    lines
                      },
                      SELF.lines := __MakeTransforms(LEFT)
                  )
          );

      // Create a dataset of STRINGS that contain record definitions for the
      // input dataset, TRANSFORMs for converting between the old and new
      // definitions, and a sample PROJECT for kicking it all off
      LOCAL __conditionalBR := #IF((BOOLEAN)textOutput) '<br/>' #ELSE '' #END;

      LOCAL __oldRecDefsPlusTransforms := DATASET(['//----------' + __conditionalBR], __StringRec)
          & PROJECT(__allOldRecDefs.items, __StringRec)
          & DATASET(['//----------' + __conditionalBR], __StringRec)
          & __allTransforms.lines
          & DATASET(['//----------' + __conditionalBR], __StringRec)
          & DATASET(['oldDS := DATASET([], OldLayout);' + __conditionalBR], __StringRec)
          & DATASET(['newDS := PROJECT(oldDS, Make_NewLayout(LEFT));' + __conditionalBR], __StringRec);

      // Combine old definitions and transforms conditionally
      LOCAL __conditionalOldStuff :=
          #IF((BOOLEAN)emitTransform)
              __oldRecDefsPlusTransforms
          #ELSE
              DATASET([], __StringRec)
          #END;

      LOCAL __allOutput := PROJECT(__allBestRecDefs.items, __StringRec) & __conditionalOldStuff;

      // Roll everything up to one string with HTML line breaks; an extra
      // break is added after each 'END;' line
      LOCAL __htmlString := ROLLUP
          (
              __allOutput,
              TRUE,
              TRANSFORM
                  (
                      RECORDOF(LEFT),
                      rightString := IF(RIGHT.s = 'END;', RIGHT.s + '<br/>', RIGHT.s);
                      SELF.s := LEFT.s + '<br/>' + rightString
                  )
          );

      // Stuff the HTML result into a single record, wrapped with <pre> so it
      // looks right in the browser
      LOCAL __htmlResult := DATASET(['<pre>' + __htmlString[1].s + '</pre>'], {STRING result__html});

      // Choose the result (dataset with each line a string, or a text blob)
      LOCAL __finalResult := #IF((BOOLEAN)textOutput) __htmlResult #ELSE __allOutput #END;

      RETURN __finalResult;
  ENDMACRO;