فهرست منبع

Fix gh-1047, replace invalid characters in query names when publishing

Query names have to be valid XML tag names in order to be called via SOAP.

Adds UTF based functions in junicode for creating names that should
be valid xml tags according to the W3C spec.  This takes a step
towards being more flexible in our unicode handling as well.

Signed-off-by: Anthony Fishbeck <Anthony.Fishbeck@lexisnexis.com>
Anthony Fishbeck 13 سال پیش
والد
کامیت
e58de9cd30
3 فایل تغییر یافته به همراه 108 افزوده شده و 11 حذف شده
  1. 3 3
      common/workunit/workunit.cpp
  2. 94 8
      system/jlib/junicode.cpp
  3. 11 0
      system/jlib/junicode.hpp

+ 3 - 3
common/workunit/workunit.cpp

@@ -25,6 +25,7 @@
 #include "jptree.hpp"
 #include "jtime.ipp"
 #include "jencrypt.hpp"
+#include "junicode.hpp"
 #include "eclrtl.hpp"
 #include "deftype.hpp"
 #include <time.h>
@@ -8618,7 +8619,6 @@ extern WORKUNIT_API void addExceptionToWorkunit(IWorkUnit * wu, WUExceptionSever
     }
 }
 
-#define UTF8_BOM    "\357\273\277"
 
 extern WORKUNIT_API bool isArchiveQuery(const char * text)
 {
@@ -8938,8 +8938,8 @@ extern WORKUNIT_API IPropertyTree * getPackageSetRegistry(const char * wsEclId,
 
 void addQueryToQuerySet(IWorkUnit *workunit, const char *querySetName, const char *queryName, IPropertyTree *packageInfo, WUQueryActivationOptions activateOption, StringBuffer &newQueryId)
 {
-    StringBuffer cleanQueryName(queryName);
-    cleanQueryName.replace(' ', '_');
+    StringBuffer cleanQueryName;
+    appendUtf8XmlName(cleanQueryName, strlen(queryName), queryName);
 
     SCMStringBuffer dllName;
     Owned<IConstWUQuery> q = workunit->getQuery();

+ 94 - 8
system/jlib/junicode.cpp

@@ -470,6 +470,18 @@ MemoryBuffer & appendUtf32be(MemoryBuffer & out, UTF32 value)
     return out.append(writeUtf32be(temp, sizeof(temp), value), temp);
 }
 
+MemoryBuffer & appendUtf(MemoryBuffer & out, UtfReader::UtfFormat targetType, UTF32 value)
+{   // Append the single code point `value` to `out`, encoded in the format selected by `targetType`.
+    switch (targetType)
+    {
+    case UtfReader::Utf8:    appendUtf8(out, value); break;
+    case UtfReader::Utf16le: appendUtf16le(out, value); break;
+    case UtfReader::Utf16be: appendUtf16be(out, value); break;
+    case UtfReader::Utf32le: appendUtf32le(out, value); break;
+    case UtfReader::Utf32be: appendUtf32be(out, value); break;
+    }   // a targetType not listed above appends nothing
+    return out; // the same buffer that was passed in
+}
 
 /* ---------------------------------------------------------------------
 
@@ -506,14 +518,7 @@ bool convertUtf(MemoryBuffer & target, UtfReader::UtfFormat targetType, unsigned
             target.setLength(originalLength);
             return false;
         }
-        switch (targetType)
-        {
-        case UtfReader::Utf8:    appendUtf8(target, next); break;
-        case UtfReader::Utf16le: appendUtf16le(target, next); break;
-        case UtfReader::Utf16be: appendUtf16be(target, next); break;
-        case UtfReader::Utf32le: appendUtf32le(target, next); break;
-        case UtfReader::Utf32be: appendUtf32be(target, next); break;
-        }
+        appendUtf(target, targetType, next);
     }
 }
 
@@ -622,3 +627,84 @@ void addUtfActionList(StringMatcher & matcher, const char * text, unsigned actio
     }
 }
 
+extern jlib_decl bool replaceUtf(utfReplacementFunc func, MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
+{   // Decode `source` one code point at a time and hand each one to `func`, which decides what gets appended to `target`.
+    UtfReader input(type, false);
+    input.set(sourceLength, source);
+    unsigned originalLength = target.length(); // remembered so a failed conversion can be rolled back
+    loop
+    {
+        const byte * cur = input.cur; // start of the code point about to be decoded
+        UTF32 next = input.next();
+        if (next == sourceExhausted)
+            return true; // whole input consumed
+        if (next == sourceIllegal)
+        {
+            target.setLength(originalLength); // discard everything appended during this call
+            return false;
+        }
+        func(target, next, type, cur, input.cur-cur, cur==source); // raw bytes of this code point, their length, and whether it is the first one in the source
+    }
+}
+
+struct utf32ValidXmlCharRange // one inclusive range of code points accepted in an XML name
+{
+    UTF32 min; // first code point of the range
+    UTF32 max; // last code point of the range
+    bool start; // true if these code points are also accepted as the first character of the name
+};
+
+utf32ValidXmlCharRange utf32ValidXmlCharRanges[] = { // ascending, non-overlapping ranges; scanned in order by replaceBelowRange
+    {'0', '9', false}, // digits: accepted, but not as the first character
+    {'A', 'Z', true},
+    {'a', 'z', true},
+    {0xC0, 0xD6, true},
+    {0xD8, 0xF6, true},
+    {0xF8, 0x2FF, true},
+    {0x300, 0x36F, false}, // accepted, but not as the first character
+    {0x370, 0x37D, true},
+    {0x37F, 0x1FFF, true},
+    {0x200C, 0x200D, true},
+    {0x203F, 0x2040, false}, // accepted, but not as the first character
+    {0x2070, 0x218F, true},
+    {0x2C00, 0x2FEF, true},
+    {0x3001, 0xD7FF, true},
+    {0xF900, 0xFDCF, true},
+    {0xFDF0, 0xFFFD, true},
+    {0x10000, 0xEFFFF, true},
+    {0, 0, false} // terminator: min==0 marks the end of the table
+};
+
+inline bool replaceBelowRange(UTF32 match, UTF32 replace, int id, MemoryBuffer & target, UtfReader::UtfFormat type, const void * source, int len, bool start)
+{   // Test `match` against table entry `id`; returns true once the code point has been dealt with, false if the caller should try entry id+1.
+    utf32ValidXmlCharRange &r = utf32ValidXmlCharRanges[id];
+    if (r.min==0) // terminator entry reached: match is above every range in the table
+        return true; // NOTE(review): nothing is appended on this path, so such a code point is dropped rather than replaced - confirm this is intended
+    if (match>r.max)
+        return false; // above this range: let the caller move on to the next entry
+    if (match<r.min)
+    {
+        appendUtf(target, type, replace); // falls in the gap below this range, so substitute the replacement
+        return true;
+    }
+    if (!r.start && start) // inside the range, but this range is not accepted at the start of a name
+        appendUtf(target, type, replace);
+    else
+        target.append(len, source); //src and target are same, no need to reconvert
+    return true;
+}
+
+MemoryBuffer & utfXmlNameReplacementFunc(MemoryBuffer & target, UTF32 match, UtfReader::UtfFormat type, const void * source, int len, bool start)
+{   // Per-code-point callback: copies the explicitly accepted punctuation through unchanged, otherwise defers to the range table with '_' as the replacement.
+    if (match==':' || match=='_' || (!start && (match=='-' || match=='.' || match==0xB7))) // '-', '.' and 0xB7 are only accepted after the first character
+        return target.append(len, source);
+
+    for (int i=0; !replaceBelowRange(match, '_', i, target, type, source, len, start); i++); // empty loop body: walk the table until an entry deals with the code point
+
+    return target;
+}
+
+extern jlib_decl bool appendUtfXmlName(MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
+{   // Append `source` to `target` with every code point filtered through utfXmlNameReplacementFunc.
+    return replaceUtf(utfXmlNameReplacementFunc, target, type, sourceLength, source); // false means replaceUtf met an illegal sequence and restored target's original length
+}

+ 11 - 0
system/jlib/junicode.hpp

@@ -78,5 +78,16 @@ extern jlib_decl UTF32 readUtf8Character(unsigned len, const byte * & cur);
 extern jlib_decl size32_t readUtf8Size(const void * _data);
 extern jlib_decl UTF32 readUtf8Char(const void * _data);
 
+typedef MemoryBuffer & (*utfReplacementFunc)(MemoryBuffer & target, UTF32 match, UtfReader::UtfFormat type, const void * source, int len, bool start); // callback signature taken by replaceUtf
+extern jlib_decl bool replaceUtf(utfReplacementFunc func, MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source); // runs func over the source text, appending to target
+extern jlib_decl bool appendUtfXmlName(MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source); // appends source to target as an XML-name-safe string
+
+inline StringBuffer &appendUtf8XmlName(StringBuffer & target, unsigned sourceLength, const void * source)
+{   // UTF-8 convenience wrapper around appendUtfXmlName that appends its result to a StringBuffer.
+    MemoryBuffer mb; // scratch buffer that receives the converted bytes
+    appendUtfXmlName(mb, UtfReader::Utf8, sourceLength, source); // NOTE(review): the bool result is ignored - confirm callers are fine with a failed conversion going unreported
+    return target.append(mb.length(), mb.toByteArray());
+}
+
 
 #endif