// BestRecordStructure.ecl
  /***
   * Function macro that leverages DataPatterns to work out the best ECL record
   * structure for the input data and returns that structure as text.
   *
   * @param   inFile          The dataset to process; REQUIRED
   * @param   sampling        A positive integer representing a percentage of
   *                          inFile to examine, which is useful when analyzing
   *                          a very large dataset and only an estimation is
   *                          sufficient; valid range for this argument is
   *                          1-100; values outside of this range will be
   *                          clamped (the value is forwarded unchanged to
   *                          DataPatterns.Profile() as sampleSize, which is
   *                          presumed to perform the clamping); OPTIONAL,
   *                          defaults to 100 (which indicates that the entire
   *                          dataset will be analyzed)
   * @param   emitTransform   Boolean governing whether the function emits a
   *                          TRANSFORM function that could be used to rewrite
   *                          the dataset into the 'best' record definition;
   *                          OPTIONAL, defaults to FALSE.
   * @param   textOutput      Boolean governing the type of result that is
   *                          delivered by this function; if FALSE then a
   *                          recordset of STRINGs will be returned; if TRUE
   *                          then a dataset with a single STRING field, with
   *                          the contents formatted for HTML, will be
   *                          returned (this is the ideal output if the
   *                          intention is to copy the output from ECL Watch);
   *                          OPTIONAL, defaults to FALSE
   *
   * @return  A recordset defining the best ECL record structure for the data.
   *          If textOutput is FALSE (the default) then each record will contain
   *          one field declaration, and the list of declarations will be wrapped
   *          with RECORD and END strings; if the emitTransform argument was
   *          TRUE, there will also be a set of records that comprise a
   *          stand-alone TRANSFORM function.  If textOutput is TRUE then only
   *          one record will be returned, containing an HTML-formatted string
   *          containing the new field declarations (and optionally the
   *          TRANSFORM); this is the ideal format if the intention is to copy
   *          the result from ECL Watch.
   */
  EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOutput = FALSE) := FUNCTIONMACRO
      // Export the layout of inFile as XML so the template language can walk
      // its fields, and declare the template symbols used while walking
      LOADXML('<xml/>');
      #EXPORTXML(bestInFileFields, RECORDOF(inFile));
      #UNIQUENAME(bestFieldStack);
      #UNIQUENAME(bestStructType);
      #UNIQUENAME(bestLayoutType);
      #UNIQUENAME(bestCapturedPos);
      #UNIQUENAME(bestPrevCapturedPos);
      #UNIQUENAME(bestLayoutName);
      #UNIQUENAME(bestNeedsDelim);
      #UNIQUENAME(bestNamePrefix);
      #UNIQUENAME(recLevel);

      IMPORT Std;

      // Base names used when composing generated layout identifiers
      LOCAL DATAREC_NAME := 'DataRec';
      LOCAL LAYOUT_NAME := 'Layout';

      // One line of generated output
      LOCAL StringRec := {STRING s};

      // Returns TRUE when the old and new type names belong to different
      // generic families (string, data, boolean, numeric), meaning that an
      // explicit cast is needed; type names that match none of the known
      // prefixes are compared as-is (after lower-casing)
      LOCAL NeedCoercion(STRING oldType, STRING newType) := FUNCTION
          TypeFamily(STRING t) := MAP
              (
                  t[..6] = 'string'
                      OR t[..13] = 'ebcdic string'
                      OR t[..7] = 'qstring'
                      OR t[..9] = 'varstring'
                      OR t[..3] = 'utf'
                      OR t[..7] = 'unicode'
                      OR t[..10] = 'varunicode'               =>  'string',
                  t[..4] = 'data'                             =>  'data',
                  t[..7] = 'boolean'                          =>  'boolean',
                  t[..7] = 'integer'
                      OR t[..18] = 'big_endian integer'
                      OR t[..4] = 'real'
                      OR t[..7] = 'decimal'
                      OR t[..8] = 'udecimal'
                      OR t[..8] = 'unsigned'
                      OR t[..19] = 'big_endian unsigned'      =>  'numeric',
                  t
              );

          RETURN TypeFamily(Std.Str.ToLowerCase(oldType)) != TypeFamily(Std.Str.ToLowerCase(newType));
      END;

      // Describes one embedded record or child dataset: its kind, its type
      // name, the name taken from the closing entry, and the positions that
      // bracket the fields defined inside it
      LOCAL ChildRecInfoLayout := RECORD
          STRING      layoutType;
          STRING      layoutName;
          STRING      fieldName;
          UNSIGNED2   startPos;
          UNSIGNED2   endPos;
      END;

      // Walk the exported fields while maintaining a ';'-delimited stack of
      // 'kind:position:ecltype' entries ('r' = record, 'd' = dataset), with the
      // innermost open structure at the front.  An opening entry is pushed; an
      // end entry pops the front of the stack and emits one row whose end
      // position is the position of the most recent plain field seen.
      LOCAL childRecInfo := DATASET
          (
              [
                  #SET(bestFieldStack, '')
                  #SET(bestNeedsDelim, 0)
                  #FOR(bestInFileFields)
                      #FOR(Field)
                          #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                              #IF(%{@isRecord}% = 1)
                                  #SET(bestStructType, 'r')
                              #ELSE
                                  #SET(bestStructType, 'd')
                              #END
                              #IF(%'bestFieldStack'% != '')
                                  #SET(bestFieldStack, ';' + %'bestFieldStack'%)
                              #END
                              #SET(bestFieldStack, %'bestStructType'% + ':' + %'@position'% + ':' + %'@ecltype'% + %'bestFieldStack'%)
                          #ELSEIF(%{@isEnd}% = 1)
                              #SET(bestLayoutType, %'bestFieldStack'%[1])
                              #SET(bestCapturedPos, REGEXFIND('.:(\\d+)', %'bestFieldStack'%, 1))
                              #SET(bestLayoutName, REGEXFIND('.:\\d+:([^;]+)', %'bestFieldStack'%, 1))
                              #SET(bestFieldStack, REGEXFIND('^[^;]+;(.*)', %'bestFieldStack'%, 1))
                              #IF(%bestNeedsDelim% = 1) , #END
                              {
                                  %'bestLayoutType'%,
                                  %'bestLayoutName'%,
                                  %'@name'%,
                                  %bestCapturedPos%,
                                  %bestPrevCapturedPos%
                              }
                              #SET(bestNeedsDelim, 1)
                          #ELSE
                              #SET(bestPrevCapturedPos, %@position%)
                          #END
                      #END
                  #END
              ],
              ChildRecInfoLayout
          );

      // Original type, name, dotted full name, structure flags, nesting depth
      // and position of each field in the input dataset
      LOCAL FieldInfoLayout := RECORD
          STRING      eclType;
          STRING      name;
          STRING      fullName;
          BOOLEAN     isRecord;
          BOOLEAN     isDataset;
          UNSIGNED2   depth;
          UNSIGNED2   position;
      END;

      // One row per exported field that is not an end marker.  The name prefix
      // grows by 'name.' and the depth by one when a record or child dataset
      // opens; both are unwound again when its end marker is reached.
      LOCAL fieldInfo0 := DATASET
          (
              [
                  #SET(bestFieldStack, '')
                  #SET(bestNeedsDelim, 0)
                  #SET(bestNamePrefix, '')
                  #SET(recLevel, 0)
                  #FOR(bestInFileFields)
                      #FOR(Field)
                          #IF(%@isEnd% != 1)
                              #IF(%bestNeedsDelim% = 1) , #END
                              {
                                  %'@ecltype'%,
                                  %'@name'%,
                                  %'bestNamePrefix'% + %'@name'%,
                                  #IF(%@isRecord% = 1) TRUE #ELSE FALSE #END,
                                  #IF(%@isDataset% = 1) TRUE #ELSE FALSE #END,
                                  %recLevel%,
                                  %@position%
                              }
                              #SET(bestNeedsDelim, 1)
                          #END
                          #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1)
                              #APPEND(bestNamePrefix, %'@name'% + '.')
                              #SET(recLevel, %recLevel% + 1)
                          #ELSEIF(%{@isEnd}% = 1)
                              #SET(bestNamePrefix, REGEXREPLACE('\\w+\\.$', %'bestNamePrefix'%, ''))
                              #SET(recLevel, %recLevel% - 1)
                          #END
                      #END
                  #END
              ],
              FieldInfoLayout
          );

      // Add the end position of each embedded record / child dataset; fields
      // without a matching childRecInfo row are kept (LEFT OUTER)
      LOCAL fieldInfo10 := JOIN
          (
              fieldInfo0,
              childRecInfo,
              LEFT.name = RIGHT.fieldName AND LEFT.position = RIGHT.startPos,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          UNSIGNED2   endPosition
                      },
                      SELF.endPosition := RIGHT.endPos,
                      SELF := LEFT
                  ),
              LEFT OUTER
          );

      // Ask Profile() for the best ECL data types only
      LOCAL patternRes := DataPatterns.Profile(inFile, features := 'best_ecl_types', sampleSize := sampling);

      // Attach the recommended type to each field, matching on the dotted full
      // name; when Profile() supplies no type the original type is retained
      LOCAL fieldInfo15 := JOIN
          (
              fieldInfo10,
              patternRes,
              LEFT.fullName = RIGHT.attribute,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          STRING      bestAttributeType
                      },
                      SELF.bestAttributeType := IF(RIGHT.best_attribute_type != '', Std.Str.ToUpperCase(RIGHT.best_attribute_type), LEFT.eclType),
                      SELF := LEFT
                  ),
              LEFT OUTER
          );

      // Give anonymous embedded records / child datasets a generated type name
      // (DataRec_NNNN, numbered by row) and precompute the explicit-cast
      // assignment statement for fields whose type family changes
      LOCAL fieldInfo20 := PROJECT
          (
              fieldInfo15,
              TRANSFORM
                  (
                      {
                          RECORDOF(LEFT),
                          STRING      bestAssignment
                      },
                      isAnonymousStruct := (LEFT.isDataset OR LEFT.isRecord) AND LEFT.bestAttributeType IN ['<unnamed>', 'table of <unnamed>'];
                      generatedName := DATAREC_NAME + '_' + INTFORMAT(COUNTER, 4, 1);
                      SELF.eclType := IF(isAnonymousStruct, generatedName, Std.Str.ToUpperCase(LEFT.eclType)),
                      SELF.bestAttributeType := IF(isAnonymousStruct, generatedName, LEFT.bestAttributeType),
                      SELF.bestAssignment := IF
                          (
                              NeedCoercion(SELF.eclType, SELF.bestAttributeType),
                              ' SELF.' + LEFT.name + ' := (' + Std.Str.ToUpperCase(SELF.bestAttributeType) + ')r.' + LEFT.name + ';',
                              ''
                          ),
                      SELF := LEFT
                  )
          );

      // One generated line plus the bookkeeping needed to build TRANSFORMs
      LOCAL LayoutItems := RECORD(StringRec)
          STRING      fullName := '';
          STRING      bestAssignment := '';
      END;

      // One generated record definition and where it sits in the input layout
      LOCAL ChildRecLayout := RECORD
          STRING                  layoutName;
          UNSIGNED2               startPos;
          UNSIGNED2               endPos;
          UNSIGNED2               depth;
          DATASET(LayoutItems)    items;
      END;

      // Builds the lines of one RECORD definition ('<prefix><layoutName> :=
      // RECORD', one declaration per field in position order, then 'END;').
      // The prefix is 'New' when useBest is TRUE, otherwise 'Old'.  Each field
      // line also carries the assignment statement a TRANSFORM would need.
      LOCAL MakeRecDefinition(DATASET(RECORDOF(fieldInfo20)) ds, STRING layoutName, BOOLEAN useBest = TRUE) := FUNCTION
          versionTag := IF(useBest, 'New', 'Old');

          headerLine := DATASET([{versionTag + layoutName + ' := RECORD'}], LayoutItems);

          fieldLines := PROJECT
              (
                  SORT(ds, position),
                  TRANSFORM
                      (
                          LayoutItems,
                          chosenType := IF(useBest, LEFT.bestAttributeType, LEFT.eclType);
                          isStructured := LEFT.isDataset OR LEFT.isRecord;
                          // Structured types refer to another generated layout,
                          // so they carry the same New/Old prefix
                          qualifiedType := IF(isStructured, versionTag, '') + chosenType;
                          declaredType := IF(LEFT.isDataset, 'DATASET(' + qualifiedType + ')', qualifiedType);
                          SELF.s := ' ' + declaredType + ' ' + LEFT.name + ';',
                          SELF.bestAssignment := MAP
                              (
                                  LEFT.bestAssignment != ''   =>  LEFT.bestAssignment,
                                  LEFT.isRecord               =>  ' SELF.' + LEFT.name + ' := ROW(Make_' + qualifiedType + '(r.' + LEFT.name + '));',
                                  LEFT.isDataset              =>  ' SELF.' + LEFT.name + ' := PROJECT(r.' + LEFT.name + ', Make_' + qualifiedType + '(LEFT));',
                                  ''
                              ),
                          SELF := LEFT
                      )
              );

          footerLine := DATASET([{'END;'}], LayoutItems);

          RETURN headerLine & fieldLines & footerLine;
      END;

      // Loop body: for every record / child dataset declared at depth
      // (aDepth - 1), gather the fields at depth aDepth that lie between its
      // start and end positions and turn them into a record definition; the
      // new definitions are appended to those collected so far.  Compiler
      // warning 4531 is suppressed for the DENORMALIZE.
      LOCAL ProcessChildRecs(DATASET(ChildRecLayout) layoutDS, UNSIGNED2 aDepth, BOOLEAN useBest = TRUE) := FUNCTION
          extractedRecs := DENORMALIZE
              (
                  fieldInfo20(depth = (aDepth - 1) AND (isRecord OR isDataset)),
                  fieldInfo20(depth = aDepth),
                  RIGHT.position BETWEEN LEFT.position + 1 AND LEFT.endPosition,
                  GROUP,
                  TRANSFORM
                      (
                          ChildRecLayout,
                          SELF.layoutName := LEFT.bestAttributeType,
                          SELF.items := MakeRecDefinition(ROWS(RIGHT), SELF.layoutName, useBest),
                          SELF.startPos := LEFT.position,
                          SELF.endPos := LEFT.endPosition,
                          SELF.depth := aDepth,
                          SELF := LEFT
                      ),
                  ALL, ORDERED(TRUE)
              ) : ONWARNING(4531, IGNORE);

          RETURN layoutDS + extractedRecs;
      END;

      // Record definitions for the ORIGINAL layout: nested definitions are
      // collected from the deepest level upward, ordered by end position (and
      // by descending start position within that), then the top-level layout
      // is appended last
      LOCAL oldNamedChildRecs0 := LOOP
          (
              DATASET([], ChildRecLayout),
              MAX(fieldInfo20, depth),
              ProcessChildRecs(ROWS(LEFT), MAX(fieldInfo20, depth) + 1 - COUNTER, FALSE)
          );

      LOCAL oldNamedChildRecs := SORT(oldNamedChildRecs0, endPos, -startPos);

      LOCAL topLevelOldRecDef := DATASET
          (
              [
                  {
                      LAYOUT_NAME,
                      0,
                      0,
                      0,
                      MakeRecDefinition(fieldInfo20(depth = 0), LAYOUT_NAME, FALSE)
                  }
              ],
              ChildRecLayout
          );

      LOCAL allOldRecDefs := oldNamedChildRecs & topLevelOldRecDef;

      // The same set of record definitions, this time using the recommended
      // ECL data types
      LOCAL bestNamedChildRecs0 := LOOP
          (
              DATASET([], ChildRecLayout),
              MAX(fieldInfo20, depth),
              ProcessChildRecs(ROWS(LEFT), MAX(fieldInfo20, depth) + 1 - COUNTER, TRUE)
          );

      LOCAL bestNamedChildRecs := SORT(bestNamedChildRecs0, endPos, -startPos);

      LOCAL topLevelBestRecDef := DATASET
          (
              [
                  {
                      LAYOUT_NAME,
                      0,
                      0,
                      0,
                      MakeRecDefinition(fieldInfo20(depth = 0), LAYOUT_NAME, TRUE)
                  }
              ],
              ChildRecLayout
          );

      LOCAL allBestRecDefs := bestNamedChildRecs & topLevelBestRecDef;

      // Builds the text of one TRANSFORM that converts an Old<layout> row into
      // a New<layout> row: a header line, every non-empty explicit assignment
      // recorded for the layout, a catch-all 'SELF := r;' and the closing END
      LOCAL MakeTransforms(ChildRecLayout recInfo) := FUNCTION
          headerLine := DATASET(['New' + recInfo.layoutName + ' Make_New' + recInfo.layoutName + '(Old' + recInfo.layoutName + ' r) := TRANSFORM'], StringRec);

          explicitAssignments := PROJECT
              (
                  recInfo.items(bestAssignment != ''),
                  TRANSFORM
                      (
                          StringRec,
                          SELF.s := LEFT.bestAssignment
                      )
              );

          RETURN headerLine
              & explicitAssignments
              & DATASET([' SELF := r;'], StringRec)
              & DATASET(['END;'], StringRec);
      END;

      // One TRANSFORM per generated 'best' record definition
      LOCAL allTransforms := PROJECT
          (
              allBestRecDefs,
              TRANSFORM
                  (
                      {
                          DATASET(StringRec)  lines
                      },
                      SELF.lines := MakeTransforms(LEFT)
                  )
          );

      // Lines emitted only when a TRANSFORM was requested: the old record
      // definitions, the TRANSFORMs converting old to new, and a sample
      // PROJECT that ties them together.  Separator and sample lines get an
      // extra HTML break when the result will be rendered as HTML.
      LOCAL conditionalBR := #IF((BOOLEAN)textOutput) '<br/>' #ELSE '' #END;

      LOCAL oldRecDefsPlusTransforms := DATASET(['//----------' + conditionalBR], StringRec)
          & PROJECT(allOldRecDefs.items, StringRec)
          & DATASET(['//----------' + conditionalBR], StringRec)
          & allTransforms.lines
          & DATASET(['//----------' + conditionalBR], StringRec)
          & DATASET(['oldDS := DATASET([], OldLayout);' + conditionalBR], StringRec)
          & DATASET(['newDS := PROJECT(oldDS, Make_NewLayout(LEFT));' + conditionalBR], StringRec);

      // Include the old definitions and transforms only on request
      LOCAL conditionalOldStuff :=
          #IF((BOOLEAN)emitTransform)
              oldRecDefsPlusTransforms
          #ELSE
              DATASET([], StringRec)
          #END;

      // Line-per-record result: new record definitions first, then the
      // optional old-definition / TRANSFORM section
      LOCAL allOutput := PROJECT(allBestRecDefs.items, StringRec) & conditionalOldStuff;

      // Collapse all lines into one string separated by HTML line breaks; a
      // line that is exactly 'END;' is followed by an additional break
      LOCAL htmlString := ROLLUP
          (
              allOutput,
              TRUE,
              TRANSFORM
                  (
                      RECORDOF(LEFT),
                      nextLine := IF(RIGHT.s = 'END;', RIGHT.s + '<br/>', RIGHT.s);
                      SELF.s := LEFT.s + '<br/>' + nextLine
                  )
          );

      // Single-record HTML result, wrapped in <pre> so that it displays
      // correctly in the browser
      LOCAL htmlResult := DATASET(['<pre>' + htmlString[1].s + '</pre>'], {STRING result__html});

      // Deliver either the line-per-record dataset or the single HTML blob
      LOCAL finalResult := #IF((BOOLEAN)textOutput) htmlResult #ELSE allOutput #END;

      RETURN finalResult;
  ENDMACRO;