Browse Source

Merge pull request #5428 from AttilaVamos/HPCC-10298-improvement

HPCC-10298 CSV auto discovery should check the validity and uniqueness of fields

Reviewed-By: Gavin Halliday <gavin.halliday@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 11 years ago
parent
commit
e6cba386ed
2 changed files with 60 additions and 10 deletions
  1. 58 10
      dali/ft/daftformat.cpp
  2. 2 0
      dali/ft/daftformat.ipp

+ 58 - 10
dali/ft/daftformat.cpp

@@ -589,6 +589,7 @@ CCsvPartitioner::CCsvPartitioner(const FileFormat & _format) : CInputBasePartiti
     isRecordStructurePresent = false;
     fieldCount = 0;
     isFirstRow = true;
+    fields.setown(new KeptAtomTable);
 }
 
 void CCsvPartitioner::storeFieldName(const char * start, unsigned len)
@@ -608,17 +609,40 @@ void CCsvPartitioner::storeFieldName(const char * start, unsigned len)
 
     if (isRecordStructurePresent && (0 < fieldName.length() ))
     {
-        fieldName.replace('-', '_');
-        fieldName.replace(' ', '_');
+        // Check discovered field name validity
+        char act = fieldName.charAt(0);
+        if ( !(isalpha(act) || act == '_') )
+        {
+            fieldName.setCharAt(0, '_');
+        }
 
-        recordStructure.append(fieldName);
+        for ( int i = 1; i < fieldName.length(); i++)
+        {
+            act = fieldName.charAt(i);
+            if ( !(isalnum(act) || act == '_' || act == '$') )
+            {
+                fieldName.setCharAt(i, '_');
+            }
+        }
     }
     else
     {
-        recordStructure.append("field");
-        recordStructure.append(fieldCount);
+        fieldName.append("field").append(fieldCount);
     }
+
+    // Check discovered field name uniqueness
+    const char * fn = fieldName.toCharArray();
+    if ( fields->find(fn) != NULL )
+    {
+        time_t t;
+        time(&t);
+        fieldName.append('_').append(fieldCount).append('_').append((unsigned)t);
+    }
+
+    recordStructure.append(fieldName);
     recordStructure.append(";\n");
+
+    fields->addAtom(fieldName.toCharArray());
 }
 
 size32_t CCsvPartitioner::getSplitRecordSize(const byte * start, unsigned maxToRead, bool processFullBuffer, bool ateof)
@@ -890,6 +914,7 @@ CUtfPartitioner::CUtfPartitioner(const FileFormat & _format) : CInputBasePartiti
     isRecordStructurePresent = false;
     fieldCount = 0;
     isFirstRow = true;
+    fields.setown(new KeptAtomTable);
 }
 
 void CUtfPartitioner::storeFieldName(const char * start, unsigned len)
@@ -913,17 +938,40 @@ void CUtfPartitioner::storeFieldName(const char * start, unsigned len)
 
     if (isRecordStructurePresent && (0 < fieldName.length() ))
     {
-        fieldName.replace('-', '_');
-        fieldName.replace(' ', '_');
+        // Check discovered field name validity
+        char act = fieldName.charAt(0);
+        if ( !(isalpha(act) || act == '_') )
+        {
+            fieldName.setCharAt(0, '_');
+        }
 
-        recordStructure.append(fieldName);
+        for ( int i = 1; i < fieldName.length(); i++)
+        {
+            act = fieldName.charAt(i);
+            if ( !(isalnum(act) || act == '_' || act == '$') )
+            {
+                fieldName.setCharAt(i, '_');
+            }
+        }
     }
     else
     {
-        recordStructure.append("field");
-        recordStructure.append(fieldCount);
+        fieldName.append("field").append(fieldCount);
     }
+
+    // Check discovered field name uniqueness
+    const char * fn = fieldName.toCharArray();
+    if ( fields->find(fn) != NULL )
+    {
+        time_t t;
+        time(&t);
+        fieldName.append('_').append(fieldCount).append('_').append((unsigned)t);
+    }
+
+    recordStructure.append(fieldName);
     recordStructure.append(";\n");
+
+    fields->addAtom(fieldName.toCharArray());
 }
 
 size32_t CUtfPartitioner::getSplitRecordSize(const byte * start, unsigned maxToRead, bool processFullBuffer, bool ateof)

+ 2 - 0
dali/ft/daftformat.ipp

@@ -259,6 +259,7 @@ protected:
     bool            isRecordStructurePresent;
     StringBuffer    recordStructure;
     unsigned        fieldCount;
+    Owned<KeptAtomTable> fields;
     bool            isFirstRow;
 };
 
@@ -314,6 +315,7 @@ protected:
     bool            isRecordStructurePresent;
     StringBuffer    recordStructure;
     unsigned        fieldCount;
+    Owned<KeptAtomTable> fields;
     bool            isFirstRow;
 };