Ver código fonte

HPCC-23730 Bump std.DataPatterns to v1.6.3

Signed-off-by: Gordon Smith <gordonjsmith@gmail.com>
Gordon Smith 5 anos atrás
pai
commit
293b6e8c1b

+ 2 - 2
ecllibrary/std/DataPatterns/BestRecordStructure.ecl

@@ -282,7 +282,7 @@ EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOu
         RETURN DATASET([{displayedLayoutName + ' := RECORD'}], __LayoutItems)
             & PROJECT
                 (
-                    SORT(ds, position),
+                    DISTRIBUTE(SORT(ds, position), 0),
                     TRANSFORM
                         (
                             __LayoutItems,
@@ -389,7 +389,7 @@ EXPORT BestRecordStructure(inFile, sampling = 100, emitTransform = FALSE, textOu
         RETURN DATASET(['New' + recInfo.layoutName + ' Make_New' + recInfo.layoutName + '(Old' + recInfo.layoutName + ' r) := TRANSFORM'], __StringRec)
             & PROJECT
                 (
-                    recInfo.items,
+                    DISTRIBUTE(recInfo.items, 0),
                     TRANSFORM
                         (
                             __StringRec,

+ 65 - 34
ecllibrary/std/DataPatterns/Profile.ecl

@@ -53,42 +53,43 @@
  *      popular_patterns        The most common patterns of values; see below
  *      rare_patterns           The least common patterns of values; see below
  *      is_numeric              Boolean indicating if the original attribute
- *                              was a numeric scalar and therefore whether or
- *                              not the numeric_xxxx output fields will be
+ *                              was a numeric scalar or if the best_attribute_type
+ *                              value was a numeric scaler; if TRUE then the
+ *                              numeric_xxxx output fields will be
  *                              populated with actual values; if this value
  *                              is FALSE then all numeric_xxxx output values
  *                              should be ignored
  *      numeric_min             The smallest non-nil value found within the
- *                              attribute as a DECIMAL; the attribute must be
- *                              a numeric ECL datatype; non-numeric attributes
- *                              will return zero
+ *                              attribute as a DECIMAL; this value is valid only
+ *                              if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_max             The largest non-nil value found within the
- *                              attribute as a DECIMAL; the attribute must be
- *                              a numeric ECL datatype; non-numeric attributes
- *                              will return zero
+ *                              attribute as a DECIMAL;this value is valid only
+ *                              if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_mean            The mean (average) non-nil value found within
- *                              the attribute as a DECIMAL; the attribute must
- *                              be a numeric ECL datatype; non-numeric
- *                              attributes will return zero
+ *                              the attribute as a DECIMAL; this value is valid
+ *                              only if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_std_dev         The standard deviation of the non-nil values
- *                              in the attribute as a DECIMAL; the attribute
- *                              must be a numeric ECL datatype; non-numeric
- *                              attributes will return zero
+ *                              in the attribute as a DECIMAL; this value is valid
+ *                              only if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_lower_quartile  The value separating the first (bottom) and
  *                              second quarters of non-nil values within
- *                              the attribute as a DECIMAL; the attribute must
- *                              be a numeric ECL datatype; non-numeric
- *                              attributes will return zero
+ *                              the attribute as a DECIMAL; this value is valid only
+ *                              if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_median          The median non-nil value within the attribute
- *                              as a DECIMAL; the attribute must be a numeric
- *                              ECL datatype; non-numeric attributes will return
- *                              zero
+ *                              as a DECIMAL; this value is valid only
+ *                              if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
  *      numeric_upper_quartile  The value separating the third and fourth
  *                              (top) quarters of non-nil values within
- *                              the attribute as a DECIMAL; the attribute must
- *                              be a numeric ECL datatype; non-numeric
- *                              attributes will return zero
- *      numeric_correlations    A child dataset containing correlation values
+ *                              the attribute as a DECIMAL; this value is valid only
+ *                              if is_numeric is TRUE; if is_numeric is FALSE
+ *                              then zero will show here
+ *      correlations            A child dataset containing correlation values
  *                              comparing the current numeric attribute with all
  *                              other numeric attributes, listed in descending
  *                              correlation value order; the attribute must be
@@ -159,7 +160,7 @@
  *                              quartiles               numeric_lower_quartile
  *                                                      numeric_median
  *                                                      numeric_upper_quartile
- *                              correlations            numeric_correlations
+ *                              correlations            correlations
  *                          To omit the output associated with a single keyword,
  *                          set this argument to a comma-delimited string
  *                          containing all other keywords; note that the
@@ -399,7 +400,7 @@ EXPORT Profile(inFile,
         %NumericStat_t%                 numeric_lower_quartile;
         %NumericStat_t%                 numeric_median;
         %NumericStat_t%                 numeric_upper_quartile;
-        DATASET(CorrelationRec)         numeric_correlations {MAXCOUNT(%fieldCount%)};
+        DATASET(CorrelationRec)         correlations {MAXCOUNT(%fieldCount%)};
     END;
 
     // Define the record layout that will be returned to the caller; note
@@ -454,7 +455,7 @@ EXPORT Profile(inFile,
             %NumericStat_t%             numeric_upper_quartile;
         #END
         #IF(%FeatureEnabledCorrelations%())
-            DATASET(CorrelationRec)     numeric_correlations;
+            DATASET(CorrelationRec)     correlations;
         #END
     END;
 
@@ -887,6 +888,21 @@ EXPORT Profile(inFile,
                     )
             );
 
+        #UNIQUENAME(filledDataInfoNumeric);
+        LOCAL %filledDataInfoNumeric% := JOIN
+            (
+                %filledDataInfo%,
+                %attributeBestTypeInfo%,
+                LEFT.attribute = RIGHT.attribute,
+                TRANSFORM
+                    (
+                        RECORDOF(LEFT),
+                        SELF.is_number := LEFT.is_number OR (REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', RIGHT.best_attribute_type) AND NOT REGEXFIND('set of ', RIGHT.best_attribute_type)),
+                        SELF := LEFT
+                    ),
+                LEFT OUTER, KEEP(1), SMART
+            ) : ONWARNING(4531, IGNORE);
+
         // Build a set of attributes for quartiles, unique values, and modes for
         // each processed attribute
         #SET(recLevel, 0);
@@ -915,7 +931,7 @@ EXPORT Profile(inFile,
                         #UNIQUENAME(uniqueNumericValueCounts)
                         %uniqueNumericValueCounts% := PROJECT
                             (
-                                %filledDataInfo%(attribute = %'namePrefix'% + %'@name'% AND is_number),
+                                %filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'% AND is_number),
                                 TRANSFORM
                                     (
                                         {
@@ -992,7 +1008,7 @@ EXPORT Profile(inFile,
                         // each occurs in the data
                         LOCAL #EXPAND(%_MakeAttr%(%'namePrefix'% + %'@name'% + '_uniq_value_recs')) := TABLE
                             (
-                                %filledDataInfo%(attribute = %'namePrefix'% + %'@name'%),
+                                %filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'%),
                                 {
                                     string_value,
                                     UNSIGNED4 rec_count := SUM(GROUP, value_count)
@@ -1149,7 +1165,7 @@ EXPORT Profile(inFile,
         #UNIQUENAME(dataPatternStats0);
         LOCAL %dataPatternStats0% := PROJECT
             (
-                %filledDataInfo%,
+                %filledDataInfoNumeric%,
                 TRANSFORM
                     (
                         RECORDOF(LEFT),
@@ -1191,7 +1207,7 @@ EXPORT Profile(inFile,
         #UNIQUENAME(dataLengthStats);
         LOCAL %dataLengthStats% := TABLE
             (
-                %filledDataInfo%,
+                %filledDataInfoNumeric%,
                 {
                     attribute,
                     UNSIGNED4   min_length := MIN(GROUP, data_length),
@@ -1403,12 +1419,27 @@ EXPORT Profile(inFile,
                 %final20%
             #END;
 
+        #UNIQUENAME(final35);
+        LOCAL %final35% := JOIN
+            (
+                %final30%,
+                %attributeBestTypeInfo%,
+                LEFT.attribute = RIGHT.attribute,
+                TRANSFORM
+                    (
+                        RECORDOF(LEFT),
+                        SELF.is_numeric := LEFT.is_numeric OR (REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', RIGHT.best_attribute_type) AND NOT REGEXFIND('set of ', RIGHT.best_attribute_type)),
+                        SELF := LEFT
+                    ),
+                LEFT OUTER, KEEP(1), SMART
+            ) : ONWARNING(4531, IGNORE);
+
         #UNIQUENAME(final40);
         LOCAL %final40% :=
             #IF(%FeatureEnabledPatterns%())
                 DENORMALIZE
                     (
-                        %final30%,
+                        %final35%,
                         %topDataPatterns%,
                         LEFT.attribute = RIGHT.attribute,
                         GROUP,
@@ -1421,7 +1452,7 @@ EXPORT Profile(inFile,
                         LEFT OUTER, SMART
                     ) : ONWARNING(4531, IGNORE)
             #ELSE
-                %final30%
+                %final35%
             #END;
 
         #UNIQUENAME(final50);
@@ -1457,7 +1488,7 @@ EXPORT Profile(inFile,
                             TRANSFORM
                                 (
                                     RECORDOF(LEFT),
-                                    SELF.numeric_correlations := SORT
+                                    SELF.correlations := SORT
                                         (
                                             PROJECT
                                                 (

+ 8 - 7
ecllibrary/teststd/DataPatterns/TestDataPatterns.ecl

@@ -57,7 +57,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(Basic_String_Profile[1].numeric_lower_quartile = 0),
             ASSERT(Basic_String_Profile[1].numeric_median = 0),
             ASSERT(Basic_String_Profile[1].numeric_upper_quartile = 0),
-            ASSERT(COUNT(Basic_String_Profile[1].numeric_correlations) = 0),
+            ASSERT(COUNT(Basic_String_Profile[1].correlations) = 0),
             ASSERT(TRUE)
         ];
 
@@ -108,7 +108,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(Basic_Numeric_Profile[1].numeric_lower_quartile = -1000),
             ASSERT(Basic_Numeric_Profile[1].numeric_median = 500),
             ASSERT(Basic_Numeric_Profile[1].numeric_upper_quartile = 2000),
-            ASSERT(COUNT(Basic_Numeric_Profile[1].numeric_correlations) = 0),
+            ASSERT(COUNT(Basic_Numeric_Profile[1].correlations) = 0),
             ASSERT(TRUE)
         ];
 
@@ -116,7 +116,8 @@ EXPORT TestDataPatterns := MODULE
     // Empty data detection
     //--------------------------------------------------------------------------
 
-    // Layout contains every ECL data type that Profile can deal with
+    // Layout contains every ECL data type that Profile can process (except for
+    // SET OF datatypes, child records, and child datasets)
     SHARED EmptyDataLayout := RECORD
         BOOLEAN f_boolean;
         INTEGER f_integer;
@@ -426,7 +427,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.x')[1].numeric_lower_quartile = 123),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.x')[1].numeric_median = 555),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.x')[1].numeric_upper_quartile = 0),
-            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.x')[1].numeric_correlations) = 2),
+            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.x')[1].correlations) = 2),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].attribute = 'foo.y'),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].rec_count = 2),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].given_attribute_type = 'unsigned4'),
@@ -447,7 +448,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].numeric_lower_quartile = 345),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].numeric_median = 555),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.y')[1].numeric_upper_quartile = 0),
-            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.y')[1].numeric_correlations) = 2),
+            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.y')[1].correlations) = 2),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].attribute = 'foo.z'),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].rec_count = 2),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].given_attribute_type = 'unsigned4'),
@@ -468,7 +469,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].numeric_lower_quartile = 543),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].numeric_median = 555),
             ASSERT(Embedded_Child1_Profile(attribute = 'foo.z')[1].numeric_upper_quartile = 0),
-            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.z')[1].numeric_correlations) = 2),
+            ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.z')[1].correlations) = 2),
             ASSERT(TRUE)
         ];
 
@@ -576,7 +577,7 @@ EXPORT TestDataPatterns := MODULE
             ASSERT(SetOf_Types_Profile(attribute = 'my_set')[1].numeric_lower_quartile = 0),
             ASSERT(SetOf_Types_Profile(attribute = 'my_set')[1].numeric_median = 0),
             ASSERT(SetOf_Types_Profile(attribute = 'my_set')[1].numeric_upper_quartile = 0),
-            ASSERT(COUNT(SetOf_Types_Profile(attribute = 'my_set')[1].numeric_correlations) = 0),
+            ASSERT(COUNT(SetOf_Types_Profile(attribute = 'my_set')[1].correlations) = 0),
             ASSERT(TRUE)
         ];