Browse Source

HPCC-10298 CSV auto discovery should check the validity and uniqueness
of the discovered field name.

Add code to validate the field name and check its uniqueness.
All invalid characters are changed to '_'.

Use an atom table to store field names for the uniqueness check.

If a field name isn't unique, append a '_'+<field index>+'_'+<timestamp>
suffix.

Signed-off-by: Attila Vamos <attila.vamos@gmail.com>

Attila Vamos 11 years ago
parent
commit
d00ce0ae8c
2 changed files with 8 additions and 34 deletions
  1. 6 32
      dali/ft/daftformat.cpp
  2. 2 2
      dali/ft/daftformat.ipp

+ 6 - 32
dali/ft/daftformat.cpp

@@ -589,6 +589,7 @@ CCsvPartitioner::CCsvPartitioner(const FileFormat & _format) : CInputBasePartiti
     isRecordStructurePresent = false;
     fieldCount = 0;
     isFirstRow = true;
+    fields.setown(new KeptAtomTable);
 }
 
 void CCsvPartitioner::storeFieldName(const char * start, unsigned len)
@@ -625,21 +626,8 @@ void CCsvPartitioner::storeFieldName(const char * start, unsigned len)
         }
 
         // Check discovered field name uniqueness
-        bool alreadyExist = false;
         const char * fn = fieldName.toCharArray();
-        ForEachItemIn(idx, fields)
-        {
-            StringAttrItem & field = fields.item(idx);
-
-            int result = strcmp(field.text.get(), fn);
-            if ( !result )
-            {
-                alreadyExist = true;
-                break;
-            }
-        }
-
-        if ( alreadyExist )
+        if ( fields->find(fn) != NULL )
         {
             time_t t;
             time(&t);
@@ -654,8 +642,7 @@ void CCsvPartitioner::storeFieldName(const char * start, unsigned len)
     recordStructure.append(fieldName);
     recordStructure.append(";\n");
 
-    StringAttrItem *field = new StringAttrItem(fieldName.toCharArray());
-    fields.append(*field);
+    fields->addAtom(fieldName.toCharArray());
 }
 
 size32_t CCsvPartitioner::getSplitRecordSize(const byte * start, unsigned maxToRead, bool processFullBuffer, bool ateof)
@@ -927,6 +914,7 @@ CUtfPartitioner::CUtfPartitioner(const FileFormat & _format) : CInputBasePartiti
     isRecordStructurePresent = false;
     fieldCount = 0;
     isFirstRow = true;
+    fields.setown(new KeptAtomTable);
 }
 
 void CUtfPartitioner::storeFieldName(const char * start, unsigned len)
@@ -967,21 +955,8 @@ void CUtfPartitioner::storeFieldName(const char * start, unsigned len)
         }
 
         // Check discovered field name uniqueness
-        bool alreadyExist = false;
         const char * fn = fieldName.toCharArray();
-        ForEachItemIn(idx, fields)
-        {
-            StringAttrItem & field = fields.item(idx);
-
-            int result = strcmp(field.text.get(), fn);
-            if( !result )
-            {
-                alreadyExist = true;
-                break;
-            }
-        }
-
-        if ( alreadyExist )
+        if ( fields->find(fn) != NULL )
         {
             time_t t;
             time(&t);
@@ -996,8 +971,7 @@ void CUtfPartitioner::storeFieldName(const char * start, unsigned len)
     recordStructure.append(fieldName);
     recordStructure.append(";\n");
 
-    StringAttrItem *field = new StringAttrItem(fieldName.toCharArray());
-    fields.append(*field);
+    fields->addAtom(fieldName.toCharArray());
 }
 
 size32_t CUtfPartitioner::getSplitRecordSize(const byte * start, unsigned maxToRead, bool processFullBuffer, bool ateof)

+ 2 - 2
dali/ft/daftformat.ipp

@@ -259,7 +259,7 @@ protected:
     bool            isRecordStructurePresent;
     StringBuffer    recordStructure;
     unsigned        fieldCount;
-    StringAttrArray fields;
+    Owned<KeptAtomTable> fields;
     bool            isFirstRow;
 };
 
@@ -315,7 +315,7 @@ protected:
     bool            isRecordStructurePresent;
     StringBuffer    recordStructure;
     unsigned        fieldCount;
-    StringAttrArray fields;
+    Owned<KeptAtomTable> fields;
     bool            isFirstRow;
 };